21 Jul 2010 merge

* Use Flaptor's Indextank product for search, including santip's patch for their new API
* Add Cassandra ConsistencyLevels to the ini file, and storage-conf.xml to the public repo
* Patch contributed by umbrae in ticket reddit-archive#929: Add jumpToContent support for Keyboard Accessibility
* reddit gold
  - paypal/postcard support
  - friends with benefits
  - profile-page sorting for gold members
  - move domain listings into the permacache
kbrower · Jul 22, 2010 · 0ae8f2f · 0ae8f2f
1 parent 52da322
commit 0ae8f2f
Show file tree

Hide file tree

Showing 70 changed files with 2,802 additions and 2,927 deletions.
diff --git a/config/cassandra/storage-conf.xml b/config/cassandra/storage-conf.xml
@@ -0,0 +1,216 @@
+<Storage>
+  <!--======================================================================-->
+  <!-- Basic Configuration                                                  -->
+  <!--======================================================================-->
+
+  <ClusterName>reddit</ClusterName>
+
+  <AutoBootstrap>false</AutoBootstrap>
+  <HintedHandoffEnabled>true</HintedHandoffEnabled>
+
+  <Keyspaces>
+    <Keyspace Name="permacache">
+      <ColumnFamily CompareWith="BytesType" Name="permacache" RowsCached="3000000" />
+
+      <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
+      <ReplicationFactor>3</ReplicationFactor>
+      <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
+    </Keyspace>
+
+    <Keyspace Name="urls">
+      <ColumnFamily CompareWith="UTF8Type" Name="urls" />
+
+      <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
+      <ReplicationFactor>3</ReplicationFactor>
+      <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
+    </Keyspace>
+
+    <Keyspace Name="reddit">
+      <!-- Relations -->
+      <ColumnFamily CompareWith="UTF8Type" Name="LinkVote" />
+      <ColumnFamily CompareWith="UTF8Type" Name="CommentVote" />
+
+      <!-- Views -->
+      <ColumnFamily CompareWith="UTF8Type" Name="VotesByLink" />
+
+      <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
+      <ReplicationFactor>3</ReplicationFactor>
+      <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
+    </Keyspace>
+
+  </Keyspaces>
+
+  <Authenticator>org.apache.cassandra.auth.AllowAllAuthenticator</Authenticator>
+
+  <Partitioner>org.apache.cassandra.dht.RandomPartitioner</Partitioner>
+
+  <InitialToken></InitialToken>
+
+  <CommitLogDirectory>/cassandra/commitlog</CommitLogDirectory>
+  <DataFileDirectories>
+      <DataFileDirectory>/cassandra/data</DataFileDirectory>
+  </DataFileDirectories>
+
+  <Seeds>
+      <Seed>pmc01</Seed>
+      <Seed>pmc02</Seed>
+      <Seed>pmc03</Seed>
+      <Seed>pmc06</Seed>
+      <Seed>pmc07</Seed>
+      <Seed>pmc08</Seed>
+  </Seeds>
+
+  <!-- Miscellaneous -->
+
+  <!-- Time to wait for a reply from other nodes before failing the command -->
+  <RpcTimeoutInMillis>30000</RpcTimeoutInMillis>
+  <!-- Phi value that must be reached before a host is marked as down.
+       Most users should never adjust this. -->
+  <PhiConvictThreshold>10</PhiConvictThreshold>
+  <!-- Size to allow commitlog to grow to before creating a new segment -->
+  <CommitLogRotationThresholdInMB>128</CommitLogRotationThresholdInMB>
+
+  <!-- Local hosts and ports -->
+
+  <ListenAddress></ListenAddress>
+  <!-- internal communications port -->
+  <StoragePort>7000</StoragePort>
+
+  <ThriftAddress></ThriftAddress>
+  <!-- Thrift RPC port (the port clients connect to). -->
+  <ThriftPort>9160</ThriftPort>
+
+  <ThriftFramedTransport>false</ThriftFramedTransport>
+
+
+  <!--======================================================================-->
+  <!-- Memory, Disk, and Performance                                        -->
+  <!--======================================================================-->
+
+  <!--
+   ~ Access mode.  mmapped i/o is substantially faster, but only practical on
+   ~ a 64bit machine (which notably does not include EC2 "small" instances)
+   ~ or relatively small datasets.  "auto", the safe choice, will enable
+   ~ mmapping on a 64bit JVM.  Other values are "mmap", "mmap_index_only"
+   ~ (which may allow you to get part of the benefits of mmap on a 32bit
+   ~ machine by mmapping only index files) and "standard".
+   ~ (The buffer size settings that follow only apply to standard,
+   ~ non-mmapped i/o.)
+   -->
+  <DiskAccessMode>mmap_index_only</DiskAccessMode>
+
+  <!--
+   ~ Size of compacted row above which to log a warning.  (If compacted
+   ~ rows do not fit in memory, Cassandra will crash.  This is explained
+   ~ in http://wiki.apache.org/cassandra/CassandraLimitations and is
+   ~ scheduled to be fixed in 0.7.)
+  -->
+  <RowWarningThresholdInMB>512</RowWarningThresholdInMB>
+
+  <!--
+   ~ Buffer size to use when performing contiguous column slices. Increase
+   ~ this to the size of the column slices you typically perform. 
+   ~ (Name-based queries are performed with a buffer size of 
+   ~ ColumnIndexSizeInKB.)
+  -->
+  <SlicedBufferSizeInKB>64</SlicedBufferSizeInKB>
+
+  <!--
+   ~ Buffer size to use when flushing memtables to disk. (Only one 
+   ~ memtable is ever flushed at a time.) Increase (decrease) the index
+   ~ buffer size relative to the data buffer if you have few (many) 
+   ~ columns per key.  Bigger is only better _if_ your memtables get large
+   ~ enough to use the space. (Check in your data directory after your
+   ~ app has been running long enough.) -->
+  <FlushDataBufferSizeInMB>32</FlushDataBufferSizeInMB>
+  <FlushIndexBufferSizeInMB>8</FlushIndexBufferSizeInMB>
+
+  <!--
+   ~ Add column indexes to a row after its contents reach this size.
+   ~ Increase if your column values are large, or if you have a very large
+   ~ number of columns.  The competing concerns are: Cassandra has to
+   ~ deserialize this much of the row to read a single column, so you want
+   ~ it to be small - at least if you do many partial-row reads - but all
+   ~ the index data is read for each access, so you don't want to generate
+   ~ that wastefully either.
+  -->
+  <ColumnIndexSizeInKB>64</ColumnIndexSizeInKB>
+
+  <!--
+   ~ Flush memtable after this much data has been inserted, including
+   ~ overwritten data.  There is one memtable per column family, and 
+   ~ this threshold is based solely on the amount of data stored, not
+   ~ actual heap memory usage (there is some overhead in indexing the
+   ~ columns).
+  -->
+  <MemtableThroughputInMB>64</MemtableThroughputInMB>
+  <!--
+   ~ Throughput setting for Binary Memtables.  Typically these are
+   ~ used for bulk load so you want them to be larger.
+  -->
+  <BinaryMemtableThroughputInMB>256</BinaryMemtableThroughputInMB>
+  <!--
+   ~ The maximum number of columns in millions to store in memory per
+   ~ ColumnFamily before flushing to disk.  This is also a per-memtable
+   ~ setting.  Use with MemtableThroughputInMB to tune memory usage.
+  -->
+  <MemtableOperationsInMillions>0.3</MemtableOperationsInMillions>
+  <!--
+   ~ The maximum time to leave a dirty memtable unflushed.
+   ~ (While any affected columnfamilies have unflushed data from a
+   ~ commit log segment, that segment cannot be deleted.)
+   ~ This needs to be large enough that it won't cause a flush storm
+   ~ of all your memtables flushing at once because none has hit
+   ~ the size or count thresholds yet.  For production, a larger
+   ~ value such as 1440 is recommended.
+  -->
+  <MemtableFlushAfterMinutes>60</MemtableFlushAfterMinutes>
+
+  <!--
+   ~ Unlike most systems, in Cassandra writes are faster than reads, so
+   ~ you can afford more of those in parallel.  A good rule of thumb is 2
+   ~ concurrent reads per processor core.  Increase ConcurrentWrites to
+   ~ the number of clients writing at once if you use batch-mode
+   ~ CommitLogSync with a CommitLogSyncBatchWindowInMS delay. -->
+  <ConcurrentReads>8</ConcurrentReads>
+  <ConcurrentWrites>32</ConcurrentWrites>
+
+  <!--
+   ~ CommitLogSync may be either "periodic" or "batch."  When in batch
+   ~ mode, Cassandra won't ack writes until the commit log has been
+   ~ fsynced to disk.  It will wait up to CommitLogSyncBatchWindowInMS
+   ~ milliseconds for other writes, before performing the sync.
+
+   ~ This is less necessary in Cassandra than in traditional databases
+   ~ since replication reduces the odds of losing data from a failure
+   ~ after writing the log entry but before it actually reaches the disk.
+   ~ So the other option is "periodic," where writes may be acked immediately
+   ~ and the CommitLog is simply synced every CommitLogSyncPeriodInMS
+   ~ milliseconds.
+  -->
+  <CommitLogSync>periodic</CommitLogSync>
+  <!--
+   ~ Interval at which to perform syncs of the CommitLog in periodic mode.
+   ~ Usually the default of 10000ms is fine; increase it if your i/o
+   ~ load is such that syncs are taking excessively long times.
+  -->
+  <CommitLogSyncPeriodInMS>10000</CommitLogSyncPeriodInMS>
+  <!--
+   ~ Delay (in milliseconds) during which additional commit log entries
+   ~ may be written before fsync in batch mode.  This will increase
+   ~ latency slightly, but can vastly improve throughput where there are
+   ~ many writers.  Set to zero to disable (each entry will be synced
+   ~ individually).  Reasonable values range from a minimal 0.1 to 10 or
+   ~ even more if throughput matters more than latency.
+  -->
+  <!-- <CommitLogSyncBatchWindowInMS>1</CommitLogSyncBatchWindowInMS> --> 
+
+  <!--
+   ~ Time to wait before garbage-collecting deletion markers.  Set this to
+   ~ a large enough value that you are confident that the deletion marker
+   ~ will be propagated to all replicas by the time this many seconds have
+   ~ elapsed, even in the face of hardware failures.  The default value is
+   ~ ten days.
+  -->
+  <GCGraceSeconds>864000</GCGraceSeconds>
+</Storage>
diff --git a/r2/draw_load.py b/r2/draw_load.py
@@ -59,6 +59,11 @@ def draw_box(label, color, center = False):
             draw_box("  %s load: %s" % (host.host, host.load()),
                      get_load_level(host))
 
+    draw_box(" ==== MEDIA ==== ", "#BBBBBB", center = True)
+    for host in hosts:
+        if host.host.startswith('media'):
+            draw_box("  %s load: %s" % (host.host, host.load()),
+                     get_load_level(host))
     draw_box(" ==== SEARCH ==== ", "#BBBBBB", center = True)
     for host in hosts:
         if host.host.startswith('search'):

diff --git a/r2/example.ini b/r2/example.ini
@@ -92,6 +92,9 @@ servicecaches = 127.0.0.1:11211
 permacache_memcaches = 127.0.0.1:11211
 # cassandra hosts. one of these will be chosen at random by pycassa
 cassandra_seeds = 127.0.0.1:9160
+# read/write consistency levels for Cassandra
+cassandra_rcl = ONE
+cassandra_wcl = QUORUM
 
 # -- url cache options --
 url_caches = 127.0.0.1:11211
@@ -285,6 +288,8 @@ MIN_UP_KARMA = 1
 MIN_RATE_LIMIT_KARMA = 10
 MIN_RATE_LIMIT_COMMENT_KARMA = 1
 QUOTA_THRESHOLD = 5
+# Links and comments older than this many days qualify for historic preservation
+REPLY_AGE_LIMIT = 180
 
 # min amount of karma to edit 
 WIKI_KARMA = 100
@@ -302,6 +307,8 @@ num_comments = 200
 max_comments = 500
 # list of reddits to auto-subscribe users to
 automatic_reddits = 
+# special reddit that only reddit gold subscribers can use
+lounge_reddit =
 # cutoff number of reddits to show unsubscribed users
 num_default_reddits = 10
 # how deep do we go into the top listing when fetching /random
@@ -338,5 +345,5 @@ beaker.session_secret = somesecret
 # WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT*
 # Debug mode will enable the interactive debugging tool, allowing ANYONE to
 # execute malicious code after an exception is raised.
-set debug = true
+#set debug = false
 
diff --git a/r2/r2/config/routing.py b/r2/r2/config/routing.py
@@ -169,6 +169,9 @@ def make_map(global_conf={}, app_conf={}):
     mc('/message/moderator/:subwhere', controller='message', action='listing',
        where = 'moderator')
 
+    mc('/thanks', controller='forms', action="thanks", secret = '')
+    mc('/thanks/:secret', controller='forms', action="thanks")
+
     mc('/password', controller='forms', action="password")
     mc('/:action', controller='front',
        requirements=dict(action="random|framebuster|selfserviceoatmeal"))
@@ -202,6 +205,7 @@ def make_map(global_conf={}, app_conf={}):
        requirements=dict(action="options|over18|unlogged_options|optout|optin|login|reg"))
 
     mc('/api/distinguish/:how', controller='api', action="distinguish")
+    mc('/api/ipn/:secret', controller='api', action='ipn')
     mc('/api/:action/:url_user', controller='api',
        requirements=dict(action="login|register"))
     mc('/api/gadget/click/:ids', controller = 'api', action='gadget', type='click')