feat: add dataset viewer, remove tree-sitter languages that haven't been…

… verified (TabbyML#509) * refactor: remove unverified tree-sitter queries * feat(experimental): add dataset viewer update
jokemanfire · Oct 5, 2023 · 1babc38 · 1babc38
1 parent 55f68d4
commit 1babc38
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 137 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml
@@ -12,7 +12,6 @@ job_scheduler = "1.2.1"
 tabby-common = { path = "../tabby-common" }
 tantivy = { workspace = true }
 tracing = { workspace = true }
-tree-sitter-javascript = "0.20.0"
 tree-sitter-tags = "0.20.2"
 walkdir = "2.3.3"
 lazy_static = { workspace = true }
@@ -21,10 +20,6 @@ serde-jsonlines = { workspace = true }
 file-rotate = "0.7.5"
 tree-sitter-python = "0.20.2"
 tree-sitter-rust = "0.20.3"
-tree-sitter-go = "0.20.0"
-tree-sitter-java = "0.20.0"
-tree-sitter-typescript = "0.20.2"
-tree-sitter-lua = "0.0.19"
 
 [dev-dependencies]
 temp_testdir = "0.2"

diff --git a/crates/tabby-scheduler/src/dataset.rs b/crates/tabby-scheduler/src/dataset.rs
@@ -248,83 +248,6 @@ lazy_static! {
                     .unwrap(),
                 ),
             ),
-            (
-                "javascript",
-                TagsConfigurationSync(
-                    TagsConfiguration::new(
-                        tree_sitter_javascript::language(),
-                        tree_sitter_javascript::TAGGING_QUERY,
-                        "",
-                    )
-                    .unwrap(),
-                ),
-            ),
-            (
-                "jsx",
-                TagsConfigurationSync(
-                    TagsConfiguration::new(
-                        tree_sitter_javascript::language(),
-                        tree_sitter_javascript::TAGGING_QUERY,
-                        "",
-                    )
-                    .unwrap(),
-                ),
-            ),
-            (
-                "typescript",
-                TagsConfigurationSync(
-                    TagsConfiguration::new(
-                        tree_sitter_typescript::language_typescript(),
-                        tree_sitter_typescript::TAGGING_QUERY,
-                        "",
-                    )
-                    .unwrap(),
-                ),
-            ),
-            (
-                "tsx",
-                TagsConfigurationSync(
-                    TagsConfiguration::new(
-                        tree_sitter_typescript::language_tsx(),
-                        tree_sitter_typescript::TAGGING_QUERY,
-                        "",
-                    )
-                    .unwrap(),
-                ),
-            ),
-            (
-                "java",
-                TagsConfigurationSync(
-                    TagsConfiguration::new(
-                        tree_sitter_java::language(),
-                        tree_sitter_java::TAGGING_QUERY,
-                        "",
-                    )
-                    .unwrap(),
-                ),
-            ),
-            (
-                "go",
-                TagsConfigurationSync(
-                    TagsConfiguration::new(
-                        tree_sitter_go::language(),
-                        tree_sitter_go::TAGGING_QUERY,
-                        "",
-                    )
-                    .unwrap(),
-                ),
-            ),
-            (
-                "lua",
-                TagsConfigurationSync(
-                    TagsConfiguration::new(
-                        tree_sitter_lua::language(),
-                        tree_sitter_lua::TAGS_QUERY,
-                        "",
-                    )
-                    .unwrap(),
-                ),
-            ),
         ])
     };
 }
diff --git a/experimental/dataset-viewer/main.py b/experimental/dataset-viewer/main.py
@@ -0,0 +1,47 @@
+import pandas as pd
+import streamlit as st
+
+# force wide mode
+st.set_page_config(layout="wide")
+
+st.write("Files")
+
+# read dataframe.
+df = pd.read_json("~/.tabby/dataset/data.jsonl", lines = True)
+
+# remove useless columns
+del df["git_url"]
+
+# filter df
+df = df[df["max_line_length"] < 200]
+df = df[df.apply(lambda x: len(x['tags']) > 0, axis=1)]
+
+selected = st.selectbox(
+   "Filename",
+   df.filepath,
+)
+
+selected_row = df[df.filepath == selected].iloc[0]
+
+def get_range(lst, x):
+    return lst[x['start']:x['end']]
+
+if selected_row is not None:
+    kinds = set([x['syntax_type_name'] for x in selected_row.tags])
+    enabled_kinds = st.multiselect("Displayed Kinds", kinds, default=kinds, key=selected_row.filepath)
+    col1, col2 = st.columns(2)
+
+    content = selected_row.content
+    with col1:
+        st.write(f"File: {selected_row.filepath}")
+        st.code(content, line_numbers=True)
+
+    with col2:
+        for tag in selected_row.tags:
+            name = get_range(content, tag['name_range'])
+            kind = tag['syntax_type_name']
+            if kind not in enabled_kinds:
+                continue
+            is_definition = '✅' if tag['is_definition'] else '❌'
+            st.markdown(f"### `{name}`\nkind: {kind}, is_definition: {is_definition}")
+            st.code(get_range(content, tag['range']))