Version 0.5.4
abcache: Speed up AbCacheFilesystem `ls()`
mos9527 committed Nov 22, 2024
1 parent 7959355 commit 946ab9c
Showing 5 changed files with 96 additions and 33 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -11,7 +11,7 @@
     version=sssekai.__version__,
     author="greats3an",
     author_email="[email protected]",
-    description="Project SEKAI Asset Utility / PJSK 资源下载 + Live2D, Spine, USM 提取",
+    description="Project SEKAI Asset Utility / PJSK 资源工具",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/mos9527/sssekai",
2 changes: 1 addition & 1 deletion sssekai/__init__.py
@@ -1,5 +1,5 @@
 __VERSION_MAJOR__ = 0
 __VERSION_MINOR__ = 5
-__VERSION_PATCH__ = 3
+__VERSION_PATCH__ = 4
 
 __version__ = "%s.%s.%s" % (__VERSION_MAJOR__, __VERSION_MINOR__, __VERSION_PATCH__)
2 changes: 1 addition & 1 deletion sssekai/__main__.py
Expand Up @@ -28,7 +28,7 @@ def write(__s):
return sys.stdout.write(__s)

parser = argparse.ArgumentParser(
description="""SSSekai Proejct SEKAI feat. Hatsune Miku (Android) Asset Utility""",
description="""Project SEKAI Asset Utility / PJSK 资源工具""",
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
Expand Down
97 changes: 77 additions & 20 deletions sssekai/abcache/fs.py
@@ -1,6 +1,6 @@
 import math, fsspec
 from typing import Callable
-from functools import cached_property
+from functools import cached_property, cache
 from collections import defaultdict
 from fsspec.spec import AbstractBufferedFile
 from fsspec.caching import BaseCache, register_cache
@@ -68,7 +68,7 @@ class AbCacheFile(AbstractBufferedFile):
     - Seeks are simulated by read-aheads (via UnidirectionalBlockCache), meaning seek
       operations incur additional downloads (the bytes in between are cached as well).
     """
-
+    DEFAULT_BLOCK_SIZE = 65536  # 64KB
     entry: AbCacheEntry
 
     @property
@@ -81,12 +81,13 @@ def entry(self) -> AbCacheEntry:
         assert entry is not None, "entry not found"
         return entry
 
-    def __init__(self, fs, bundle: str):
+    def __init__(self, fs, bundle: str, block_size=None):
         self.fs, self.path = fs, bundle
         self.fetch_loc = 0
         super().__init__(
             fs,
             bundle,
+            block_size=block_size or self.DEFAULT_BLOCK_SIZE,
             mode="rb",
             cache_type="unidirectional_blockcache",
             size=self.entry.fileSize,
@@ -104,7 +105,7 @@ def __innner():
             for block in decrypt_iter(
                 lambda nbytes: next(self.__resp.iter_content(nbytes)), self.blocksize
             ):
-                yield block
+                yield bytes(block)
 
         return __innner()
 
@@ -117,7 +118,7 @@ def _fetch_range(self, start, end):
 # Reference: https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/libarchive.py
 class AbCacheFilesystem(AbstractArchiveFileSystem):
     """Filesystem for reading from an AbCache on demand."""
-
+    root_marker = "/"
     protocol = "abcache"
     cache: AbCache
 
@@ -142,25 +143,81 @@ def __init__(self, fo: str = "", cache_obj: AbCache = None, *args, **kwargs):
 
     @cached_property
     def dir_cache(self):
-        cache = defaultdict(dict)
-        for path, bundle in self.cache.abcache_index.bundles.items():
-            path = "/" + path
-            cache.update(
-                {
-                    dirname: {"name": dirname, "size": 0, "type": "directory"}
-                    for dirname in self._all_dirnames([path])
-                }
-            )
-            cache[path] = {
-                "name": path,
-                "size": bundle.fileSize,
-                "type": "file",
-            }
-        return cache
+        # The reference implementation did an O(n) scan on *every* ls() call.
+        # Preprocess the tree once in O(n log n) instead, making each ls() O(1).
+        bundles = self.cache.abcache_index.bundles
+        # Only the leaf nodes (bundles) are given; the directories are derived.
+        keys = set(self.root_marker + key for key in bundles.keys())
+        keys |= self._all_dirnames(bundles.keys())
+        # Lexicographic sorting puts the keys in DFS (pre-order) order.
+        keys = [self.root_marker] + sorted(keys)
+        _trim = lambda key: key[len(self.root_marker):]
+        nodes = [{
+            "name": key,
+            "type": "file" if _trim(key) in bundles else "directory",
+            "size": bundles[_trim(key)].fileSize if _trim(key) in bundles else 0,
+            "item_count": 0,
+            "file_count": 0,
+            "total_size": 0,
+        } for key in keys]
+        # Keys are already in DFS order: walk them once with an explicit stack,
+        # recording each directory's children and folding subtree totals upward.
+        stack = [0]
+        graph = defaultdict(list)
+        table = {node["name"]: index for index, node in enumerate(nodes)}
+        def is_file(name):
+            return _trim(name) in bundles
+        def is_parent_path(a, b):
+            # True iff a is an ancestor directory of b.
+            if a == self.root_marker: return True
+            return b.startswith(a + self.root_marker)
+        def maintain():
+            # The walk starts from root, so the stack holds >= 2 entries here.
+            u, v = stack[-2], stack[-1]
+            nodes[u]["item_count"] += nodes[v]["item_count"]
+            nodes[u]["file_count"] += nodes[v]["file_count"]
+            nodes[u]["total_size"] += nodes[v]["total_size"]
+            stack.pop()
+        for index, name in enumerate(keys):
+            # Skip the root itself.
+            if index == 0:
+                continue
+            while not is_parent_path(keys[stack[-1]], name):
+                maintain()
+            pa = stack[-1]
+            nodes[pa]["item_count"] += 1
+            graph[pa].append(index)
+            if not is_file(name):
+                stack.append(index)
+            else:
+                nodes[pa]["file_count"] += 1
+                nodes[pa]["total_size"] += nodes[index]["size"]
+                nodes[index]["total_size"] = nodes[index]["size"]
+        while len(stack) >= 2:
+            maintain()
+        assert nodes[0]["file_count"] == len(bundles), "file count mismatch"
+        return nodes, graph, table

     def _get_dirs(self):
         return self.dir_cache
 
+    def info(self, path, **kwargs):
+        nodes, graph, table = self._get_dirs()
+        path = path or self.root_marker
+        if path in table:
+            return nodes[table[path]]
+        else:
+            raise FileNotFoundError(path)
+
+    @cache
+    def ls(self, path, detail=True, **kwargs):
+        nodes, graph, table = self._get_dirs()
+        path = path or self.root_marker
+        if path in table:
+            u = table[path]
+            return [nodes[v] if detail else nodes[v]["name"] for v in graph[u]]
+        return []
+
     def open(self, path, mode="rb"):
         assert mode == "rb", "only binary read-only mode is supported"
         return AbCacheFile(self, path)
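
The new `dir_cache` above hinges on one observation: lexicographically sorted paths arrive in DFS pre-order, so a single pass with an explicit stack can attach children to their parent and fold per-subtree totals upward whenever a subtree closes. Below is a minimal, self-contained sketch of that technique; names are simplified, sizes are plain ints, and only `file_count`/`total_size` are tracked, so treat it as an illustration rather than sssekai's API.

# Sketch of the sort-then-single-DFS-pass aggregation used by dir_cache.
from collections import defaultdict

def build_tree(bundles):
    # bundles maps "a/b/c.bundle" (no leading slash) -> size in bytes.
    keys = set("/" + k for k in bundles)
    for k in list(keys):  # derive every ancestor directory of each leaf
        parts = k[1:].split("/")
        for i in range(1, len(parts)):
            keys.add("/" + "/".join(parts[:i]))
    keys = ["/"] + sorted(keys)  # lexicographic sort == DFS pre-order
    nodes = [{"name": k,
              "type": "file" if k[1:] in bundles else "directory",
              "size": bundles.get(k[1:], 0),
              "file_count": 0, "total_size": 0} for k in keys]
    children = defaultdict(list)
    index = {n["name"]: i for i, n in enumerate(nodes)}
    stack = [0]  # directories whose subtree is still open; root stays put

    def is_parent(a, b):
        return a == "/" or b.startswith(a + "/")

    def close():  # fold the finished subtree on top of the stack into its parent
        u, v = stack[-2], stack[-1]
        nodes[u]["file_count"] += nodes[v]["file_count"]
        nodes[u]["total_size"] += nodes[v]["total_size"]
        stack.pop()

    for i, name in enumerate(keys[1:], start=1):
        while not is_parent(keys[stack[-1]], name):
            close()  # we have left that directory; push its totals upward
        children[stack[-1]].append(i)
        if nodes[i]["type"] == "directory":
            stack.append(i)
        else:
            nodes[stack[-1]]["file_count"] += 1
            nodes[stack[-1]]["total_size"] += nodes[i]["size"]
            nodes[i]["total_size"] = nodes[i]["size"]
    while len(stack) >= 2:  # close everything back down to the root
        close()
    return nodes, children, index

# After the one-time build, every listing is a plain lookup:
nodes, children, index = build_tree({"a/b.bundle": 5, "a/c/d.bundle": 7})
assert nodes[index["/"]]["file_count"] == 2
assert nodes[index["/"]]["total_size"] == 12
print([nodes[v]["name"] for v in children[index["/a"]]])  # ['/a/b.bundle', '/a/c']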
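
For orientation, here is a hedged usage sketch of the resulting filesystem. The constructor argument and the bundle path are placeholders; how the underlying AbCache is built or located is outside this diff.

# Hypothetical usage sketch; treat the paths below as assumptions.
from sssekai.abcache.fs import AbCacheFilesystem

fs = AbCacheFilesystem(fo="~/.sssekai/abcache")  # 'fo' per the __init__ signature above

print(fs.info("/"))                    # O(1) lookup: name/type/size/file_count/total_size
for entry in fs.ls("/", detail=True):  # memoized via functools.cache
    print(entry["type"], entry["name"], entry["size"])

with fs.open("/some/bundle") as f:  # AbCacheFile: sequential reads; seeks are
    head = f.read(128)              # simulated by read-ahead, as documented above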
26 changes: 16 additions & 10 deletions sssekai/entrypoint/abserve.py
@@ -1,4 +1,4 @@
-import os, logging
+import os, logging, datetime, time, sys
 from shutil import copyfileobj
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from sssekai import __version__
@@ -13,7 +13,9 @@ class AbServeHTTPRequestHandler(BaseHTTPRequestHandler):
     ENCODING = "utf-8"
 
     def format_listing(self, path):
+        t0 = time.time()
         r = []
+        path = path or "/"
         title = f"Directory listing for {path}"
         r = (
             f"<!DOCTYPE HTML>"
@@ -23,13 +25,15 @@ def format_listing(self, path):
             f"<style>"
             f"body {{ font-family: monospace; }}"
             f"body {{ background-color: black; color: white; }}"
-            f"a {{ color: lightblue; }}"
+            f"a,i {{ color: lightblue; }}"
             f"</style>"
             f"<title>{title}</title>"
             f"</head>"
             f"<body><h1>{title}</h1>"
-            f"<hr><ul>"
-            f'<li><a href="..">..</a></li>'
+            f"<i>children: {len(fs.ls(path))},</i>"
+            f"<i>total number of files: {fs.info(path)['file_count']},</i>"
+            f"<i>total size: {filesize.decimal(fs.info(path)['total_size'])}</i><br>"
+            f'<hr><ul><li><a href="..">..</a></li>'
         )
         for entry in sorted(
             fs.listdir(path), key=lambda x: (x["type"], x["name"], x["size"])
@@ -40,14 +44,16 @@ def format_listing(self, path):
             nodename = name.split("/")[-1]
             linkname = name
             displayname = nodename
+            extra_tags = " ".join([f'{k}="{v}"' for k, v in entry.items()])
             if fs.isdir(name):
-                linkname += "/"
+                linkname += "/"
             else:
-                fsize = filesize.decimal(entry["size"])
-                displayname += f" ({fsize})"
-            r += '<li><a href="%s">%s</a></li>' % (linkname, displayname)
-        r += "</ul><hr>"
-        r += "<i>sssekai v%s, %s</i>" % (__version__, fs.cache)
+                displayname += f" ({filesize.decimal(entry['size'])})"
+            r += f'<li><a {extra_tags} href="{linkname}">{displayname}</a></li>'
+        r += "</ul><hr>"
+        r += f"<i>sssekai v{__version__} running on Python {sys.version}</i><br>"
+        r += f"<i>{fs.cache}</i><br>"
+        r += "<i>page rendered in %.3fms, server time: %s</i>" % ((time.time() - t0) * 1000, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
         r += "</body></html>"
         encoded = r.encode(self.ENCODING, "surrogateescape")
         return encoded
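
The handler above is served by the stdlib `ThreadingHTTPServer` imported at the top of this file. The entrypoint's actual wiring is not part of this diff, but it presumably amounts to something like the sketch below; the host and port defaults are assumptions.

# Sketch only: the real entrypoint argument handling is not shown in this diff.
# ThreadingHTTPServer serves each request on its own thread, so a slow bundle
# download does not block directory listings.
def serve(host="0.0.0.0", port=8000):  # assumed defaults
    with ThreadingHTTPServer((host, port), AbServeHTTPRequestHandler) as httpd:
        httpd.serve_forever()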
