Merge pull request hrsh7th#23 from dmitmel/synchronous-indexing
Improve reliability of async indexing while the user is editing the file, implement the memory usage optimization for the indexer, make its speed configurable
hrsh7th authored Dec 24, 2021
2 parents e26cdfb + eba65f6 commit a01cfec
Showing 4 changed files with 236 additions and 78 deletions.
65 changes: 64 additions & 1 deletion README.md
@@ -19,7 +19,7 @@ The following source configuration options are available. To set any of these options, do:
```lua
cmp.setup({
sources = {
{
{
name = 'buffer',
option = {
-- Options go into this table
@@ -109,6 +109,20 @@ end
```


### indexing_interval (type: number)

_Default:_ `100`

Advanced option. See the section [Indexing](#indexing).


### indexing_batch_size (type: number)

_Default:_ `1000`

Advanced option. See the section [Indexing](#indexing).


## Locality bonus comparator (distance-based sorting)

This source also provides a comparator function which uses information from the word indexer
@@ -133,3 +147,52 @@ cmp.setup({
}
})
```


## Indexing

When a buffer is opened, this source first has to scan all lines in the buffer, match all words,
and store their occurrences. This process is called _indexing_. When the text in the buffer is
actually edited, the index of words is kept up-to-date with the buffer's contents; this is called
_watching_, and it is done by re-running the indexer on just the changed lines. Indexing happens
completely asynchronously in the background, unlike watching, which must be performed
synchronously to ensure that the index of words stays perfectly in sync with the lines in the
buffer. Most of the time this is not a problem, though, since typical text edit operations affect
only one or two lines, unless you are pasting a 1000-line snippet.
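The watching mechanism is built on Neovim's buffer-attach API. As a simplified standalone sketch of the idea (not this plugin's actual code: the real indexer uses the configured `keyword_pattern` vim-regex rather than the Lua pattern below, and also adjusts line numbers when lines are inserted or deleted):

```lua
-- index maps a 1-based line number to the list of words on that line.
local index = {}

vim.api.nvim_buf_attach(0, false, {
  on_lines = function(_, bufnr, _, first_line, old_last_line, new_last_line)
    -- Synchronously re-index only the lines touched by this edit.
    local lines = vim.api.nvim_buf_get_lines(bufnr, first_line, new_last_line, true)
    for i, line in ipairs(lines) do
      local words = {}
      for word in line:gmatch('[%w_]+') do
        table.insert(words, word)
      end
      index[first_line + i] = words
    end
  end,
})
```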

_Note that you can freely edit the buffer while it is being indexed_: the underlying algorithm is
written in such a way that your changes will not break the index or cause errors. If a crash does
happen, it is a bug, so please report it.

The speed of indexing is configurable with two options: `indexing_interval` and
`indexing_batch_size`. Essentially, when indexing, a timer is started which pulls a batch of
`indexing_batch_size` lines from the buffer, scans them for words, and repeats after
`indexing_interval` milliseconds. Decreasing the interval and/or increasing the batch size will
make the indexer faster, at the expense of higher CPU usage and more lag when editing the file
while indexing is still in progress. Setting `indexing_batch_size` to a negative value switches
the indexer to a "synchronous" mode: all lines are processed in one go, which takes less time in
total (since no other code runs on the Lua thread in the meantime), but with the obvious downside
that the editor UI is blocked.
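A configuration sketch tuning both options (the numbers are illustrative, not recommendations):

```lua
cmp.setup({
  sources = {
    {
      name = 'buffer',
      option = {
        -- Tick every 50 ms instead of the default 100 ms, and scan 2000
        -- lines per tick instead of 1000: faster indexing of big files,
        -- at the cost of more CPU usage and editing lag while indexing.
        indexing_interval = 50,
        indexing_batch_size = 2000,
        -- A negative batch size would instead index the whole buffer
        -- synchronously in one go, blocking the UI.
      },
    },
  },
})
```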

### Performance on large text files

This source has been tested on code files a few megabytes in size (5-10 MB) and contains
optimizations for them; however, the indexed words can still take up tens of megabytes of RAM if
the file is large. It also currently has trouble with files with very long lines, see issue
[#13](https://github.com/hrsh7th/cmp-buffer/issues/13).

So, if you wish to avoid accidentally running this source on big files, you can tweak
`get_bufnrs`, for example like this:

```lua
get_bufnrs = function()
local buf = vim.api.nvim_get_current_buf()
local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf))
if byte_size > 1024 * 1024 then -- 1 Megabyte max
return {}
end
return { buf }
end
```

Of course, this snippet can be combined with any other recipes for `get_bufnrs`.
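For instance, a sketch combining the size check above with a "visible buffers" recipe (the window iteration is a common pattern borrowed from typical `get_bufnrs` examples, not something prescribed by this plugin):

```lua
get_bufnrs = function()
  local bufs = {}
  for _, win in ipairs(vim.api.nvim_list_wins()) do
    local buf = vim.api.nvim_win_get_buf(win)
    -- Index every visible buffer, but skip those larger than 1 MiB.
    local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf))
    if byte_size <= 1024 * 1024 then
      bufs[buf] = true -- use the buffer number as a key to deduplicate
    end
  end
  return vim.tbl_keys(bufs)
end
```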
170 changes: 95 additions & 75 deletions lua/cmp_buffer/buffer.lua
@@ -1,11 +1,18 @@
local timer = require('cmp_buffer.timer')

local function clear_table(tbl)
for k in pairs(tbl) do
tbl[k] = nil
end
end

---@class cmp_buffer.Buffer
---@field public bufnr number
---@field public opts cmp_buffer.Options
---@field public regex any
---@field public indexing_chunk_size number
---@field public indexing_interval number
---@field public timer any|nil
---@field public timer cmp_buffer.Timer
---@field public lines_count number
---@field public timer_current_line number
---@field public lines_words table<number, string[]>
---@field public unique_words_curr_line table<string, boolean>
---@field public unique_words_other_lines table<string, boolean>
@@ -20,6 +27,10 @@
---@field public words_distances_dirty boolean
local buffer = {}

-- For some reason, requesting this many lines multiple times in chunks leads
-- to much better memory usage than fetching the entire file in one go.
buffer.GET_LINES_CHUNK_SIZE = 1000

---Create new buffer object
---@param bufnr number
---@param opts cmp_buffer.Options
@@ -28,16 +39,15 @@ function buffer.new(bufnr, opts)
local self = setmetatable({}, { __index = buffer })

self.bufnr = bufnr
self.timer = nil
self.timer = timer.new()
self.closed = false
self.on_close_cb = nil

self.opts = opts
self.regex = vim.regex(self.opts.keyword_pattern)
self.indexing_chunk_size = 1000
self.indexing_interval = 200

self.lines_count = 0
self.timer_current_line = -1
self.lines_words = {}

self.unique_words_curr_line = {}
@@ -58,8 +68,11 @@ end
function buffer.close(self)
self.closed = true
self:stop_indexing_timer()
self.timer:close()
self.timer = nil

self.lines_count = 0
self.timer_current_line = -1
self.lines_words = {}

self.unique_words_curr_line = {}
@@ -79,28 +92,16 @@ function buffer.close(self)
end

function buffer.stop_indexing_timer(self)
if self.timer and not self.timer:is_closing() then
self.timer:stop()
self.timer:close()
end
self.timer = nil
self.timer:stop()
self.timer_current_line = -1
end

function buffer.mark_all_lines_dirty(self)
self.unique_words_curr_line_dirty = true
self.unique_words_other_lines_dirty = true
self.last_edit_first_line = 0
self.last_edit_last_line = 0
end

---Indexing buffer
function buffer.index(self)
self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
for i = 1, self.lines_count do
self.lines_words[i] = {}
end

self:index_range_async(0, self.lines_count)
self.words_distances_dirty = true
end

--- Workaround for https://github.com/neovim/neovim/issues/16729
@@ -112,48 +113,67 @@ function buffer.safe_buf_call(self, callback)
end
end

function buffer.index_range(self, range_start, range_end)
function buffer.index_range(self, range_start, range_end, skip_already_indexed)
self:safe_buf_call(function()
local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
for i, line in ipairs(lines) do
self:index_line(range_start + i, line)
local chunk_size = self.GET_LINES_CHUNK_SIZE
local chunk_start = range_start
while chunk_start < range_end do
local chunk_end = math.min(chunk_start + chunk_size, range_end)
local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true)
for i, line in ipairs(chunk_lines) do
if not skip_already_indexed or not self.lines_words[chunk_start + i] then
self:index_line(chunk_start + i, line)
end
end
chunk_start = chunk_end
end
end)
end

function buffer.index_range_async(self, range_start, range_end)
local chunk_start = range_start

local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
function buffer.start_indexing_timer(self)
self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
self.timer_current_line = 0

-- Negative values result in an integer overflow in luv (vim.loop), and zero
-- disables timer repeat, so only intervals of at least 1 ms are valid.
local interval = math.max(1, self.opts.indexing_interval)
self.timer:start(0, interval, function()
if self.closed then
self:stop_indexing_timer()
return
end

self.timer = vim.loop.new_timer()
self.timer:start(
0,
self.indexing_interval,
vim.schedule_wrap(function()
if self.closed then
return
end
-- Note that the async indexer is designed not to break even if the user is
-- editing the file while it is in the process of being indexed. Because the
-- indexing in the watcher must use the synchronous algorithm, we assume
-- that the data already present in self.lines_words is correct and doesn't
-- need refreshing here: even if we receive text from nvim_buf_get_lines
-- different from what the watcher has seen so far, it (the watcher) will
-- catch up on the next on_lines event.

-- Skip over the already indexed lines
while self.lines_words[self.timer_current_line + 1] do
self.timer_current_line = self.timer_current_line + 1
end

local chunk_end = math.min(chunk_start + self.indexing_chunk_size, range_end)
self:safe_buf_call(function()
for linenr = chunk_start + 1, chunk_end do
self:index_line(linenr, lines[linenr])
end
end)
chunk_start = chunk_end
self:mark_all_lines_dirty()
self.words_distances_dirty = true
local batch_start = self.timer_current_line
local batch_size = self.opts.indexing_batch_size
-- NOTE: self.lines_count may be modified by the indexer.
local batch_end = batch_size >= 1 and math.min(batch_start + batch_size, self.lines_count) or self.lines_count
if batch_end >= self.lines_count then
self:stop_indexing_timer()
end
self.timer_current_line = batch_end
self:mark_all_lines_dirty()

if chunk_end >= range_end then
self:stop_indexing_timer()
end
end)
)
self:index_range(batch_start, batch_end, true)
end)
end

--- watch
function buffer.watch(self)
self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)

-- NOTE: As far as I know, indexing in watching can't be done asynchronously
-- because even built-in commands generate multiple consecutive `on_lines`
-- events, and I'm not even mentioning plugins here. To get accurate results
@@ -218,8 +238,24 @@ function buffer.watch(self)
end
self.lines_count = new_lines_count

-- replace lines
self:index_range(first_line, new_last_line)
-- This branch is support code for handling cases where the user is
-- editing the buffer while the async indexer is running. It solves the
-- problem that if new lines are inserted or old lines are deleted, the
-- indices of all subsequent lines will change, so the indexer's current
-- position must be adjusted to not accidentally skip any lines.
if self.timer:is_active() then
if first_line <= self.timer_current_line and self.timer_current_line < old_last_line then
-- The indexer was in the area of the current text edit. We will
-- synchronously index this area in a moment, so the indexer should
-- resume from right after the edit range.
self.timer_current_line = new_last_line
elseif self.timer_current_line >= old_last_line then
-- The indexer was somewhere past the current text edit. This means
-- that the line numbers could have changed, and the indexing
-- position must be adjusted accordingly.
self.timer_current_line = self.timer_current_line + delta
end
end

if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then
self.unique_words_curr_line_dirty = true
@@ -231,30 +267,20 @@ function buffer.watch(self)
self.last_edit_last_line = new_last_line

self.words_distances_dirty = true

-- replace lines
self:index_range(first_line, new_last_line)
end,

on_reload = function(_, _)
if self.closed then
return true
end

-- The logic for adjusting lines list on buffer reloads is much simpler
-- because tables of all lines can be assumed to be fresh.
local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr)
if new_lines_count > self.lines_count then -- append
for i = self.lines_count + 1, new_lines_count do
self.lines_words[i] = {}
end
elseif new_lines_count < self.lines_count then -- remove
for i = self.lines_count, new_lines_count + 1, -1 do
self.lines_words[i] = nil
end
end
self.lines_count = new_lines_count
clear_table(self.lines_words)

self:index_range(0, self.lines_count)
self:mark_all_lines_dirty()
self.words_distances_dirty = true
self:stop_indexing_timer()
self:start_indexing_timer()
end,

on_detach = function(_, _)
@@ -266,12 +292,6 @@
})
end

local function clear_table(tbl)
for k in pairs(tbl) do
tbl[k] = nil
end
end

---@param linenr number
---@param line string
function buffer.index_line(self, linenr, line)