Rename: primary key to sort key
velvia committed Aug 27, 2015
1 parent dcd9a97 commit e9af388
Showing 5 changed files with 23 additions and 23 deletions.

core/src/main/scala/filodb.core/datastore2/ChunkRowMap.scala (2 additions & 2 deletions)
@@ -22,10 +22,10 @@ trait ChunkRowMap {
  * Used during the columnar chunk flush process to quickly update a rowIndex, and merge it with what exists
  * on disk already
  */
-class UpdatableChunkRowMap[K : PrimaryKeyHelper] extends ChunkRowMap {
+class UpdatableChunkRowMap[K : SortKeyHelper] extends ChunkRowMap {
   import Types._

-  implicit val ordering = implicitly[PrimaryKeyHelper[K]].ordering
+  implicit val ordering = implicitly[SortKeyHelper[K]].ordering
   var index = TreeMap[K, (ChunkID, Int)]()

   def update(key: K, chunkID: ChunkID, rowNum: Int): Unit = {
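As an aside for readers of the diff, here is a minimal standalone sketch of the idea behind the renamed `UpdatableChunkRowMap` (hypothetical names and plain Scala types in place of the project's `Types` aliases, which are not shown here): a `TreeMap` ordered by sort key records which (chunk, row) currently holds each key, so a later flush simply overwrites the entry written by an earlier one.

```scala
import scala.collection.immutable.TreeMap

object ChunkRowMapSketch extends App {
  type ChunkID = Int                        // stand-in for Types.ChunkID

  // sort key -> (chunk holding the latest version of that row, row number within the chunk)
  var index = TreeMap.empty[Long, (ChunkID, Int)]

  def update(key: Long, chunkID: ChunkID, rowNum: Int): Unit =
    index += (key -> (chunkID, rowNum))

  update(1000L, chunkID = 0, rowNum = 0)    // first flush: keys 1000 and 2000 go to chunk 0
  update(2000L, chunkID = 0, rowNum = 1)
  update(2000L, chunkID = 1, rowNum = 0)    // a later flush rewrites key 2000 into chunk 1

  // Iterating the TreeMap yields rows in sort-key order, each pointing at its newest chunk
  println(index)                            // TreeMap(1000 -> (0,0), 2000 -> (1,0))
}
```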

core/src/main/scala/filodb.core/datastore2/NewApi.scala (7 additions & 1 deletion)
@@ -25,6 +25,12 @@ object Types {
   type PartitionKey = String
 }

+// A range of keys, used for describing ingest rows as well as queries
+case class KeyRange[K : SortKeyHelper](dataset: Types.TableName,
+                                       partition: Types.PartitionKey,
+                                       start: K, end: K)
+
+
 /**
  * The MemTable serves these purposes:
  * 1) Holds incoming rows of data before being flushed
@@ -92,7 +98,7 @@ trait ColumnStore {
   * @param version the version # to write the segment to
   * @returns Success, or other ErrorResponse
   */
-  def appendSegment[K : PrimaryKeyHelper](segment: Segment[K], version: Int): Future[Response]
+  def appendSegment[K : SortKeyHelper](segment: Segment[K], version: Int): Future[Response]

  /**
   * Reads segments from the column store, in order of primary key.
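A rough usage sketch (not part of the commit) for the relocated `KeyRange`: the context bound means a `SortKeyHelper` for the key type must be in implicit scope. The import path, and the assumption that `Types.TableName` is a `String` alias like `Types.PartitionKey`, are inferred from the file layout rather than shown in this diff.

```scala
import filodb.core.datastore2._                  // assumed package, matching the source path

// TimestampKeyHelper is defined later in this commit; 10,000 ms segments
implicit val helper: SortKeyHelper[Long] = TimestampKeyHelper(10000L)

// Ten seconds of rows for one partition of a hypothetical "gdelt" dataset
val range = KeyRange("gdelt",                    // dataset: Types.TableName
                     "1979",                     // partition: Types.PartitionKey
                     start = 0L,                 // inclusive
                     end   = 10000L)             // exclusive

// A Segment[Long] covering this range could then be flushed with something like
//   columnStore.appendSegment(segment, version = 0)
// which returns a Future[Response].
```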

core/src/main/scala/filodb.core/datastore2/Segment.scala (3 additions & 3 deletions)
@@ -15,7 +15,7 @@ trait Segment[K] {
   val keyRange: KeyRange[K]
   val index: ChunkRowMap

-  protected val helper: PrimaryKeyHelper[K]
+  protected val helper: SortKeyHelper[K]
   def segmentId: ByteBuffer = helper.toBytes(keyRange.start)
   def dataset: TableName = keyRange.dataset
   def partition: PartitionKey = keyRange.partition
@@ -28,11 +28,11 @@ trait Segment[K] {
   def getColumns: collection.Set[String]
 }

-class GenericSegment[K : PrimaryKeyHelper](val keyRange: KeyRange[K],
+class GenericSegment[K : SortKeyHelper](val keyRange: KeyRange[K],
                                            val index: ChunkRowMap) extends Segment[K] {
   import Types._

-  protected val helper = implicitly[PrimaryKeyHelper[K]]
+  protected val helper = implicitly[SortKeyHelper[K]]

   val chunkIds = ArrayBuffer[ChunkID]()
   val chunks = new HashMap[String, HashMap[ChunkID, Chunk]]
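Continuing the sketch from the NewApi.scala section (same assumed imports, with the implicit `TimestampKeyHelper` and the `range` value still in scope), a `GenericSegment` simply pairs a `KeyRange` with a `ChunkRowMap`, and its `segmentId` is the range's start key serialized by the helper:

```scala
val rowMap  = new UpdatableChunkRowMap[Long]     // the index type from ChunkRowMap.scala above
val segment = new GenericSegment(range, rowMap)  // K = Long inferred from `range`

segment.dataset     // "gdelt"
segment.partition   // "1979"
segment.segmentId   // helper.toBytes(0L): the segment's start key as a ByteBuffer
```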

(fourth changed file, path not captured: the SortKeyHelper / TimestampKeyHelper definitions) (6 additions & 12 deletions)
@@ -4,19 +4,19 @@ import java.nio.ByteBuffer
 import scala.math.Ordering

 /**
- * Definitions for primary keys and key ranges.
- * Primary keys must be linearizable so all the data in a partition can be evenly divisible into segments.
+ * Definitions for sort keys and key ranges.
+ * Sort keys must be linearizable so all the data in a partition can be evenly divisible into segments.
  * For more info, see [[doc/sorted_chunk_merge.md]].
  */

 /**
- * A typeclass for working with primary keys.
+ * A typeclass for working with sort keys.
  */
-trait PrimaryKeyHelper[K] {
+trait SortKeyHelper[K] {
   def ordering: Ordering[K]    // must be comparable

   /**
-   * Returns the inclusive start and exclusive end keys for the segment corresponding to a primary key.
+   * Returns the inclusive start and exclusive end keys for the segment corresponding to a sort key.
    * Must return the same start and end for all keys within [start, end) of a segment.
    */
   def getSegment(key: K): (K, K)
@@ -28,7 +28,7 @@ trait PrimaryKeyHelper[K] {
 /**
  * A typeclass for a timestamp based on a Long = milliseconds since Epoch
  */
-case class TimestampKeyHelper(intervalMs: Long) extends PrimaryKeyHelper[Long] {
+case class TimestampKeyHelper(intervalMs: Long) extends SortKeyHelper[Long] {
   def ordering: Ordering[Long] = Ordering.Long
   def getSegment(key: Long): (Long, Long) = {
     val segmentNum = key / intervalMs
@@ -38,9 +38,3 @@ case class TimestampKeyHelper(intervalMs: Long) extends PrimaryKeyHelper[Long] {
   def fromBytes(bytes: ByteBuffer): Long = bytes.getLong
 }

-// A range of keys, used for describing ingest rows as well as queries
-case class KeyRange[K : PrimaryKeyHelper](dataset: Types.TableName,
-                                          partition: Types.PartitionKey,
-                                          start: K, end: K)
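A quick worked example of the renamed typeclass's contract. The tail of `getSegment` is folded out of the diff above, so the results in the comments assume the natural completion `(segmentNum * intervalMs, (segmentNum + 1) * intervalMs)`; `toBytes` is the counterpart of `fromBytes` that `Segment.segmentId` relies on.

```scala
val keys = TimestampKeyHelper(intervalMs = 10000L)   // 10-second segments

keys.getSegment(12345L)   // presumably (10000L, 20000L): inclusive start, exclusive end
keys.getSegment(19999L)   // same segment, so the same pair (10000L, 20000L)
keys.getSegment(20000L)   // first key of the next segment: (20000L, 30000L)

keys.ordering.compare(10000L, 20000L) < 0            // true: plain Long ordering
keys.fromBytes(keys.toBytes(12345L))                 // expected to round-trip to 12345L
```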



doc/sorted_chunk_merge.md (5 additions & 5 deletions)
@@ -5,9 +5,9 @@ One challenge in FiloDB is how to continuously insert fresh data into a column s
 ### Columnar Chunking Defined

 * The dataset is divided into partitions. A single partition must fit entirely into one node. One node will probably contain many partitions.
-* There is a **primary key**, which must be linearizable and divisible into equal **segments**. For example, if the primary key is timestamp, one segment might be 10 seconds of data.
-* Within a single partition, the storage engine (say Cassandra) is responsible for keeping the segments in primary key sort order.
-* Within a single segment, for a single column, the data is grouped into columnar chunks. Each chunk is not aware of the primary key, it just knows it has data for a single column as a binary vector. The chunk is the smallest unit of data read or written.
+* There is a **sort key**, which must be linearizable and divisible into equal **segments**. For example, if the sort key is timestamp, one segment might be 10 seconds of data.
+* Within a single partition, the storage engine (say Cassandra) is responsible for keeping the segments in sort key sort order.
+* Within a single segment, for a single column, the data is grouped into columnar chunks. Each chunk is not aware of the sort key, it just knows it has data for a single column as a binary vector. The chunk is the smallest unit of data read or written.

 ### Index Writes Within a Segment

@@ -17,7 +17,7 @@ Implications:

 * All chunks for all columns within a segment must have the same number of rows. This guarantees the `ChunkRowMap` is column-independent. This implies:
   - The Memtable / tuple mover must fill in NA values if not all columns in a row are being written, and make sure the same number of rows are written for every column chunk
-* The `ChunkRowMap` is referenced by a `SegmentID`, within a partition, which is basically the first primary key of the primary key range of a segment
+* The `ChunkRowMap` is referenced by a `SegmentID`, within a partition, which is basically the first sort key of the sort key range of a segment
 * A single chunk of columnar data is referenced by (SegmentID, ChunkID). The ChunkID must be the same for all chunks that are flushed from the same set of rows. The ChunkID should be monotonically increasing with successive writes. With a single writer, using a counter would lead to more efficient reads.
 * Chunks are sorted by (SegmentID, ChunkID). This guarantees that chunks can be read in update order, in case the `ChunkRowMap` needs to be reconstructed.
 * The `ChunkRowMap` will need to be updated every time more chunks are flushed out. It needs to be computed at write time to keep reads fast
@@ -30,4 +30,4 @@ Implications:

 From time to time, all the chunks within a segment should be compacted. After compaction, there should only be one chunk per column, with all data stored in sorted order. In this case the `ChunkRowMap` isn't necessary anymore, leading to optimal read speeds.

-The primary key does not need to be stored with the `ChunkRowMap`, if the primary key is just one of the columns, since the `ChunkRowMap` can be used to read out the primary key chunks in sorted order. What is needed is just a compacted representation of the UUID and row numbers for each PK.
+The sort key does not need to be stored with the `ChunkRowMap`, if the sort key is just one of the columns, since the `ChunkRowMap` can be used to read out the sort key chunks in sorted order. What is needed is just a compacted representation of the UUID and row numbers for each PK.
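To make the ordering guarantee in the bullets above concrete, here is a small standalone sketch (hypothetical names, not repository code) of why sorting by (SegmentID, ChunkID) recovers update order, which is what lets a `ChunkRowMap` be rebuilt by replaying chunks:

```scala
// SegmentID modelled as the segment's start timestamp, ChunkID as a per-write counter
final case class ChunkKey(segmentId: Long, chunkId: Int)

// Chunks as they might come back from the store, in arbitrary order
val stored = Seq(ChunkKey(20000L, 1), ChunkKey(10000L, 0),
                 ChunkKey(10000L, 1), ChunkKey(20000L, 0))

// Sorting groups chunks by segment and, within a segment, restores write order,
// so replaying them left to right applies the newest updates last
val replayOrder = stored.sortBy(k => (k.segmentId, k.chunkId))
// Seq(ChunkKey(10000,0), ChunkKey(10000,1), ChunkKey(20000,0), ChunkKey(20000,1))
```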
