/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.streaming.receiver

import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration._
import scala.language.{existentials, postfixOps}

import WriteAheadLogBasedBlockHandler._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.{Logging, SparkConf, SparkException}
import org.apache.spark.storage._
import org.apache.spark.streaming.util.{Clock, SystemClock, WriteAheadLogFileSegment, WriteAheadLogManager}
import org.apache.spark.util.Utils

/** Trait that represents the metadata related to storage of blocks */
private[streaming] trait ReceivedBlockStoreResult {
  def blockId: StreamBlockId  // Any implementation of this trait will store a block id
}

/** Trait that represents a class that handles the storage of blocks received by a receiver */
private[streaming] trait ReceivedBlockHandler {

  /** Store a received block with the given block id and return related metadata */
  def storeBlock(blockId: StreamBlockId, receivedBlock: ReceivedBlock): ReceivedBlockStoreResult

  /** Clean up blocks older than the given threshold time */
  def cleanupOldBlock(threshTime: Long)
}
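
// A minimal usage sketch (illustrative only, not part of this change): a receiver supervisor
// is assumed to hold `env`, `streamId`, `uniqueId` and `iter`, and would use the trait like
// this:
//
// {{{
//   val handler: ReceivedBlockHandler =
//     new BlockManagerBasedBlockHandler(env.blockManager, StorageLevel.MEMORY_AND_DISK_SER_2)
//   val result = handler.storeBlock(StreamBlockId(streamId, uniqueId), IteratorBlock(iter))
//   // result.blockId identifies the stored block and can be reported to the driver
// }}}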


/**
 * Implementation of [[org.apache.spark.streaming.receiver.ReceivedBlockStoreResult]]
 * that holds the metadata of blocks stored by
 * [[org.apache.spark.streaming.receiver.BlockManagerBasedBlockHandler]]
 */
private[streaming] case class BlockManagerBasedStoreResult(blockId: StreamBlockId)
  extends ReceivedBlockStoreResult


/**
 * Implementation of a [[org.apache.spark.streaming.receiver.ReceivedBlockHandler]] which
 * stores the received blocks into a block manager with the specified storage level.
 */
private[streaming] class BlockManagerBasedBlockHandler(
    blockManager: BlockManager, storageLevel: StorageLevel)
  extends ReceivedBlockHandler with Logging {

  def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
    val putResult: Seq[(BlockId, BlockStatus)] = block match {
      case ArrayBufferBlock(arrayBuffer) =>
        blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel, tellMaster = true)
      case IteratorBlock(iterator) =>
        blockManager.putIterator(blockId, iterator, storageLevel, tellMaster = true)
      case ByteBufferBlock(byteBuffer) =>
        blockManager.putBytes(blockId, byteBuffer, storageLevel, tellMaster = true)
      case o =>
        throw new SparkException(
          s"Could not store $blockId to block manager, unexpected block type ${o.getClass.getName}")
    }
    if (!putResult.map { _._1 }.contains(blockId)) {
      throw new SparkException(
        s"Could not store $blockId to block manager with storage level $storageLevel")
    }
    BlockManagerBasedStoreResult(blockId)
  }

  def cleanupOldBlock(threshTime: Long) {
    // Nothing to do here; blocks inserted into the block manager are cleared when the
    // DStream clears its BlockRDDs.
  }
}


/**
 * Implementation of [[org.apache.spark.streaming.receiver.ReceivedBlockStoreResult]]
 * that holds the metadata of blocks stored by
 * [[org.apache.spark.streaming.receiver.WriteAheadLogBasedBlockHandler]]
 */
private[streaming] case class WriteAheadLogBasedStoreResult(
    blockId: StreamBlockId,
    segment: WriteAheadLogFileSegment
  ) extends ReceivedBlockStoreResult
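
// Sketch of why the file segment is kept around (illustrative; it assumes the
// WriteAheadLogRandomReader in org.apache.spark.streaming.util): on recovery, the serialized
// block can be re-read directly from the log file using the stored segment:
//
// {{{
//   val reader = new WriteAheadLogRandomReader(storeResult.segment.path, hadoopConf)
//   val blockBytes = reader.read(storeResult.segment)  // ByteBuffer with the block's data
//   reader.close()
// }}}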


/**
 * Implementation of a [[org.apache.spark.streaming.receiver.ReceivedBlockHandler]] which
 * stores the received blocks in both a write ahead log and a block manager.
 */
private[streaming] class WriteAheadLogBasedBlockHandler(
    blockManager: BlockManager,
    streamId: Int,
    storageLevel: StorageLevel,
    conf: SparkConf,
    hadoopConf: Configuration,
    checkpointDir: String,
    clock: Clock = new SystemClock
  ) extends ReceivedBlockHandler with Logging {

  private val blockStoreTimeout = conf.getInt(
    "spark.streaming.receiver.blockStoreTimeout", 30).seconds
  private val rollingInterval = conf.getInt(
    "spark.streaming.receiver.writeAheadLog.rollingInterval", 60)
  private val maxFailures = conf.getInt(
    "spark.streaming.receiver.writeAheadLog.maxFailures", 3)
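
  // The defaults above can be overridden through SparkConf before the handler is created,
  // e.g. (values are illustrative):
  //
  // {{{
  //   val sparkConf = new SparkConf()
  //     .set("spark.streaming.receiver.blockStoreTimeout", "60")               // seconds
  //     .set("spark.streaming.receiver.writeAheadLog.rollingInterval", "120")  // seconds
  //     .set("spark.streaming.receiver.writeAheadLog.maxFailures", "5")
  // }}}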

  // Manages rolling log files
  private val logManager = new WriteAheadLogManager(
    checkpointDirToLogDir(checkpointDir, streamId),
    hadoopConf, rollingInterval, maxFailures,
    callerName = this.getClass.getSimpleName,
    clock = clock
  )

  // Execution context for the futures that store a block in the block manager and in the
  // write ahead log in parallel; two threads so that both writes can proceed at the same time
  implicit private val executionContext = ExecutionContext.fromExecutorService(
    Utils.newDaemonFixedThreadPool(2, this.getClass.getSimpleName))

  /**
   * This implementation stores the block into the block manager as well as a write ahead log.
   * It does this in parallel, using Scala Futures, and returns only after the block has
   * been stored in both places.
   */
  def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {

    // Serialize the block so that it can be inserted into both the block manager
    // and the write ahead log
    val serializedBlock = block match {
      case ArrayBufferBlock(arrayBuffer) =>
        blockManager.dataSerialize(blockId, arrayBuffer.iterator)
      case IteratorBlock(iterator) =>
        blockManager.dataSerialize(blockId, iterator)
      case ByteBufferBlock(byteBuffer) =>
        byteBuffer
      case o =>
        throw new SparkException(
          s"Could not push $blockId to block manager, unexpected block type ${o.getClass.getName}")
    }

    // Store the block in block manager
    val storeInBlockManagerFuture = Future {
      val putResult =
        blockManager.putBytes(blockId, serializedBlock, storageLevel, tellMaster = true)
      if (!putResult.map { _._1 }.contains(blockId)) {
        throw new SparkException(
          s"Could not store $blockId to block manager with storage level $storageLevel")
      }
    }

    // Store the block in write ahead log
    val storeInWriteAheadLogFuture = Future {
      logManager.writeToLog(serializedBlock)
    }

    // Combine the futures, wait for both to complete, and return the write ahead log segment
    val combinedFuture = for {
      _ <- storeInBlockManagerFuture
      fileSegment <- storeInWriteAheadLogFuture
    } yield fileSegment
    val segment = Await.result(combinedFuture, blockStoreTimeout)
    WriteAheadLogBasedStoreResult(blockId, segment)
  }
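
  // Caller-side sketch (hypothetical): a failure or timeout in either write surfaces as an
  // exception from storeBlock, so a supervisor could handle it like this (`reportToDriver`
  // is an assumed helper):
  //
  // {{{
  //   try {
  //     val result = handler.storeBlock(blockId, block)
  //     reportToDriver(result)
  //   } catch {
  //     case e: Exception => logError(s"Failed to reliably store $blockId", e)
  //   }
  // }}}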

  def cleanupOldBlock(threshTime: Long) {
    logManager.cleanupOldLogs(threshTime)
  }

  def stop() {
    logManager.stop()
  }
}

private[streaming] object WriteAheadLogBasedBlockHandler {
  def checkpointDirToLogDir(checkpointDir: String, streamId: Int): String = {
    new Path(checkpointDir, new Path("receivedData", streamId.toString)).toString
  }
}
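
// For example, checkpointDirToLogDir("/checkpoint", 2) resolves to "/checkpoint/receivedData/2",
// i.e. each receiver stream gets its own write ahead log directory under the checkpoint directory.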