Changes to make multiple language support easier.

albertmcma · Jul 27, 2011 · 8065f64 · 8065f64
1 parent 3376d1e
commit 8065f64
Show file tree

Hide file tree

Showing 5 changed files with 129 additions and 79 deletions.
diff --git a/dxr-index.py b/dxr-index.py
@@ -3,15 +3,16 @@
 from multiprocessing import cpu_count
 from multiprocessing.pool import ThreadPool as Pool
 from itertools import chain
-import os
-import sys
-import getopt
-import subprocess
+import dxr
 import dxr.htmlbuilders
+import dxr.languages
+import getopt
+import os
 import shutil
-import dxr
 import sqlite3
 import string
+import subprocess
+import sys
 import time
 
 # At this point in time, we've already compiled the entire build, so it is time
@@ -174,23 +175,35 @@ def builddb(treecfg, dbdir):
   print "Storing data..."
   dxr.store_big_blob(treecfg, big_blob)
 
+  # Build the sql for later queries. This is a combination of the main language
+  # schema as well as plugin-specific information. The pragmas that are
+  # executed should make the sql stage go faster.
   print "Building SQL..."
-  all_statements = []
-  schemata = []
-  for plugin in dxr.get_active_plugins(treecfg):
-    schemata.append(plugin.get_schema())
-    if plugin.__name__ in big_blob:
-      all_statements.extend(plugin.sqlify(big_blob[plugin.__name__]))
-
   dbname = treecfg.tree + '.sqlite'
   conn = sqlite3.connect(os.path.join(dbdir, dbname))
   conn.execute('PRAGMA synchronous=off')
   conn.execute('PRAGMA page_size=65536')
   # Safeguard against non-ASCII text. Let's just hope everyone uses UTF-8
   conn.text_factory = str
+
+  # Import the schemata
+  schemata = [dxr.languages.get_standard_schema()]
+  for plugin in dxr.get_active_plugins(treecfg):
+    schemata.append(plugin.get_schema())
   conn.executescript('\n'.join(schemata))
   conn.commit()
-  for stmt in all_statements:
+
+  # Load and run the SQL
+  def sql_generator():  # Lazily yields every SQL statement for the active plugins that have an entry in big_blob
+    for plugin in dxr.get_active_plugins(treecfg):
+      if plugin.__name__ in big_blob:
+        plugblob = big_blob[plugin.__name__]
+        for statement in plugin.sqlify(plugblob):  # Plugin-specific statements come first
+          yield statement
+        for statement in dxr.languages.get_sql_statements("native", plugblob):  # NOTE(review): language name is hard-coded to "native" for every plugin - confirm
+          yield statement
+
+  for stmt in sql_generator():
     if isinstance(stmt, tuple):
       conn.execute(stmt[0], stmt[1])
     else:

diff --git a/dxr/languages.py b/dxr/languages.py
@@ -0,0 +1,77 @@
+import dxr.plugins
+
+# The following schema is the common global schema, so no matter which plugins
+# are used, this schema will always be present. Most tables have a language
+# column which indicates the source language that the type is written in.
+language_schema = dxr.plugins.Schema({
+  # Scope definitions: a scope is anything that is both interesting (i.e., not
+  # a namespace) and can contain other objects. The IDs for this scope should be
+  # IDs in other tables as well; the table it's in can disambiguate which type
+  # of scope you're looking at.
+  "scopes": [
+    ("scopeid", "INTEGER", False),    # An ID for this scope
+    ("sname", "VARCHAR(256)", True),  # Name of the scope
+    ("sloc", "_location", True),      # Location of the canonical decl
+    ("language", "_language", False), # The language of the scope
+    ("_key", "scopeid")
+  ],
+  # Type definitions: anything that defines a type per the relevant specs.
+  "types": [
+    ("tid", "INTEGER", False),            # Unique ID for the type
+    ("scopeid", "INTEGER", False),        # Scope this type is defined in
+    ("tname", "VARCHAR(256)", False),     # Simple name of the type
+    ("tqualname", "VARCHAR(256)", False), # Fully-qualified name of the type
+    ("tloc", "_location", False),         # Location of canonical decl
+    ("tkind", "VARCHAR(32)", True),       # Kind of type (e.g., class, union)
+    ("language", "_language", False),     # Language of the type
+    ("_key", "tid")
+  ],
+  # Inheritance relations: note that we store the full transitive closure in
+  # this table, so if A extends B and B extends C, we'd have (A, C) stored in
+  # the table as well; this is necessary to make SQL queries work, since there's
+  # no "transitive closure lookup expression".
+  "impl": [
+    ("tbase", "INTEGER", False),      # tid of base type
+    ("tderived", "INTEGER", False),   # tid of derived type
+    ("inhtype", "VARCHAR(32)", True), # Type of inheritance; NULL is indirect
+    ("_key", "tbase", "tderived")
+  ],
+  # Functions: functions, methods, constructors, operator overloads, etc.
+  "functions": [
+    ("funcid", "INTEGER", False),         # Function ID (also in scopes)
+    ("scopeid", "INTEGER", False),        # Scope defined in
+    ("fname", "VARCHAR(256)", False),     # Short name (no args)
+    ("fqualname", "VARCHAR(512)", False), # Fully qualified name, excluding args
+    ("fargs", "VARCHAR(256)", False),     # Argument string, including parens
+    ("ftype", "VARCHAR(256)", False),     # Full return type, as a string
+    ("floc", "_location", True),          # Location of definition
+    ("modifiers", "VARCHAR(256)", True),  # Modifiers (e.g., private)
+    ("language", "_language", False),     # Language of the function
+    ("_key", "funcid")
+  ],
+  # Variables: class, global, local, enum constants; they're all in here.
+  # Variables are of course not scopes, but for ease of use, they use IDs from
+  # the same namespace; no scope will have the same ID as a variable and
+  # vice versa.
+  "variables": [
+    ("varid", "INTEGER", False),         # Variable ID
+    ("scopeid", "INTEGER", False),       # Scope defined in
+    ("vname", "VARCHAR(256)", False),    # Short name
+    ("vloc", "_location", True),         # Location of definition
+    ("vtype", "VARCHAR(256)", True),     # Full type (including pointer stuff)
+    ("modifiers", "VARCHAR(256)", True), # Modifiers for the declaration
+    ("language", "_language", False),    # Language of the variable
+    ("_key", "varid")
+  ],
+  "crosslang": [  # NOTE(review): unlike the tables above, no "_key" entry is declared - confirm this is intended
+    ("canonid", "INTEGER", False),           # Presumably the canonical ID of an entity - TODO confirm
+    ("otherid", "INTEGER", False),           # Presumably the matching ID in another language - TODO confirm
+    ("otherlanguage", "VARCHAR(32)", False), # Presumably the language otherid belongs to - TODO confirm
+  ],
+})
+
+
+def get_standard_schema():  # Returns the table-creation SQL for the common language schema
+  return language_schema.get_create_sql()
+
+def get_sql_statements(lang_name, plugin_blob):  # Returns the data-insertion SQL for plugin_blob, passing lang_name through as the language
+  return language_schema.get_data_sql(plugin_blob, lang_name)
diff --git a/dxr/plugins.py b/dxr/plugins.py
@@ -98,11 +98,11 @@ def get_create_sql(self):
     """ Returns the SQL that creates the tables in this schema. """
     return '\n'.join([tbl.get_create_sql() for tbl in self.tables.itervalues()])
 
-  def get_data_sql(self, blob):
+  def get_data_sql(self, blob, language=''):  # language is forwarded unchanged to each table's get_data_sql
     """ Returns the SQL that inserts data into tables given a blob. """
     for tbl in self.tables:
       if tbl in blob:
-        sqliter = self.tables[tbl].get_data_sql(blob[tbl])
+        sqliter = self.tables[tbl].get_data_sql(blob[tbl], language)
         for sql in sqliter:
           yield sql
 
@@ -132,6 +132,7 @@ def __init__(self, tblname, tblschema):
     self.name = tblname
     self.key = None
     self.columns = []
+    self.needLang = False
     defaults = ['VARCHAR(256)', True]
     for col in tblschema:
       if isinstance(tblschema, tuple) or isinstance(tblschema, list):
@@ -154,12 +155,15 @@ def get_create_sql(self):
     sql += 'CREATE TABLE %s (\n  ' % (self.name)
     colstrs = []
     special_types = {
-      '_location': 'VARCHAR(256)'
+      '_location': 'VARCHAR(256)',
+      '_language': 'VARCHAR(32)'
     }
     for col, spec in self.columns:
       specsql = col + ' '
       if spec[0][0] == '_':
         specsql += special_types[spec[0]]
+        if spec[0] == '_language':
+          self.needLang = True
       else:
         specsql += spec[0]
       if len(spec) > 1 and spec[1] == False:
@@ -171,17 +175,17 @@ def get_create_sql(self):
     sql += '\n);\n'
     return sql
 
-  def get_data_sql(self, blobtbl):
+  def get_data_sql(self, blobtbl, language):  # Yields one (sql, args) INSERT OR IGNORE pair per row of blobtbl
     it = isinstance(blobtbl, dict) and blobtbl.itervalues() or blobtbl
     colset = set(col[0] for col in self.columns)
     for row in it:
+      if self.needLang: row['language'] = language;  # NOTE(review): mutates the caller's row; in the visible hunks needLang only becomes True inside get_create_sql - confirm that always runs first
       # Only add the keys in the columns
       keys = colset.intersection(row.iterkeys())
       args = tuple(row[k] for k in keys)
       yield ('INSERT OR IGNORE INTO %s (%s) VALUES (%s);' % (self.name,
         ','.join(keys), ','.join('?' for k in keys)), args)
 
-
 def make_get_schema_func(schema):
   """ Returns a function that satisfies get_schema's contract from the given
       schema object. """
@@ -193,3 +197,10 @@ def get_schema():
 def required_exports():
   """ Returns the required exports for a module, for use as __all__. """
   return ['post_process', 'sqlify', 'can_use', 'get_htmlifiers', 'get_schema']
+
+last_id = 0  # Most recently issued ID; module-global, so the first ID handed out is 1
+def next_global_id():
+  """ Returns a unique identifier that is unique compared to other IDs. """
+  global last_id
+  last_id += 1  # NOTE(review): unsynchronized read-modify-write - confirm callers never run concurrently
+  return last_id
diff --git a/xref-tools/cxx-clang/indexer.py b/xref-tools/cxx-clang/indexer.py
@@ -79,42 +79,36 @@ def recanon_decl(name, loc):
 
   # Produce all scopes
   scopes = {}
-  nextIndex = 1
   typeKeys = set()
   for t in types:
     key = canonicalize_decl(t[0], t[1])
     if key not in types:
       key = recanon_decl(t[0], t[1])
     if key not in scopes:
       typeKeys.add(key)
-      types[key]['tid'] = scopes[key] = nextIndex
-      nextIndex += 1
+      types[key]['tid'] = scopes[key] = dxr.plugins.next_global_id()
   # Typedefs need a tid, but they are not a scope
   for t in typedefs:
-    typedefs[t]['tid'] = nextIndex
-    nextIndex += 1
+    typedefs[t]['tid'] = dxr.plugins.next_global_id()
   funcKeys = set()
   for f in functions:
     key = canonicalize_decl(f[0], f[1])
     if key not in functions:
       key = recanon_decl(f[0], f[1])
     if key not in scopes:
       funcKeys.add(key)
-      functions[key]['funcid'] = scopes[key] = nextIndex
-      nextIndex += 1
+      functions[key]['funcid'] = scopes[key] = dxr.plugins.next_global_id()
 
   # Variables aren't scoped, but we still need to refer to them in the same
   # manner, so we'll unify variables with the scope ids
   varKeys = {}
   for v in variables:
     key = (v[0], v[1])
     if key not in varKeys:
-      varKeys[key] = variables[v]['varid'] = nextIndex
-      nextIndex += 1
+      varKeys[key] = variables[v]['varid'] = dxr.plugins.next_global_id()
 
   for m in macros:
-    macros[m]['macroid'] = nextIndex
-    nextIndex += 1
+    macros[m]['macroid'] = dxr.plugins.next_global_id()
 
   # Scopes are now defined, this allows us to modify structures for sql prep
 
@@ -282,16 +276,6 @@ def can_use(treecfg):
   return dxr.plugins.in_path('clang') and dxr.plugins.in_path('llvm-config')
 
 schema = dxr.plugins.Schema({
-  # Scope definitions: a scope is anything that is both interesting (i.e., not
-  # a namespace) and can contain other objects. The IDs for this scope should be
-  # IDs in other tables as well; the table its in can disambiguate which type of
-  # scope you're looking at.
-  "scopes": [
-    ("scopeid", "INTEGER", False),   # An ID for this scope
-    ("sname", "VARCHAR(256)", True), # Name of the scope
-    ("sloc", "_location", True),     # Location of the canonical decl
-    ("_key", "scopeid")
-  ],
   # Type definitions: anything that defines a type per the relevant specs.
   "types": [
     ("tid", "INTEGER", False),            # Unique ID for the type
@@ -301,42 +285,9 @@ def can_use(treecfg):
     ("tloc", "_location", False),         # Location of canonical decl
     ("tkind", "VARCHAR(32)", True),       # Kind of type (e.g., class, union)
     ("ttypedef", "VARCHAR(256)", True),   # Type (if this is a typedef)
+    ("language", "_language", True),      # Language of the type
     ("_key", "tid")
   ],
-  # Inheritance relations: note that we store the full transitive closure in
-  # this table, so if A extends B and B extends C, we'd have (A, C) stored in
-  # the table as well; this is necessary to make SQL queries work, since there's
-  # no "transitive closure lookup expression".
-  "impl": [
-    ("tbase", "INTEGER", False),      # tid of base type
-    ("tderived", "INTEGER", False),   # tid of derived type
-    ("inhtype", "VARCHAR(32)", True), # Type of inheritance; NULL is indirect
-    ("_key", "tbase", "tderived")
-  ],
-  # Functions: functions, methods, constructors, operator overloads, etc.
-  "functions": [
-    ("funcid", "INTEGER", False),         # Function ID (also in scopes)
-    ("scopeid", "INTEGER", False),        # Scope defined in
-    ("fname", "VARCHAR(256)", False),     # Short name (no args)
-    ("fqualname", "VARCHAR(512)", False), # Fully qualified name, excluding args
-    ("fargs", "VARCHAR(256)", False),     # Argument vector
-    ("ftype", "VARCHAR(256)", False),     # Full return type, as a string
-    ("floc", "_location", True),          # Location of definition
-    ("modifiers", "VARCHAR(256)", True),  # Modifiers (e.g., private)
-    ("_key", "funcid")
-  ],
-  # Variables: class, global, local, enum constants; they're all in here
-  # Variables are of course not scopes, but for ease of use, they use IDs from
-  # the same namespace, no scope will have the same ID as a variable and v.v.
-  "variables": [
-    ("varid", "INTEGER", False),         # Variable ID
-    ("scopeid", "INTEGER", False),       # Scope defined in
-    ("vname", "VARCHAR(256)", False),    # Short name
-    ("vloc", "_location", True),         # Location of definition
-    ("vtype", "VARCHAR(256)", True),     # Full type (including pointer stuff)
-    ("modifiers", "VARCHAR(256)", True), # Modifiers for the declaration
-    ("_key", "varid")
-  ],
   # References to functions, types, variables, etc.
   "refs": [
     ("refid", "INTEGER", False),      # ID of the identifier being referenced

diff --git a/xref-tools/moztools/indexer.py b/xref-tools/moztools/indexer.py
@@ -49,11 +49,9 @@ def collect_files(arg, dirname, fnames):
 
   blob = {}
   blob["interfaces"] = {}
-  nextNum = 2 ** 16
   for iface in interfaces:
     blob["interfaces"][iface] = interfaces[iface]
-    interfaces[iface]["iid"] = nextNum
-    nextNum += 1
+    interfaces[iface]["iid"] = dxr.plugins.next_global_id()
   tblmap = {
     "attributes": "attrid",
     "methods": "funcid",
@@ -63,10 +61,10 @@ def collect_files(arg, dirname, fnames):
     blob[table] = {}
     things = globals()[table]
     for thing, tinfo in things.iteritems():
-      blob[table][nextNum] = tinfo
-      tinfo[tblmap[table]] = nextNum
+      id = dxr.plugins.next_global_id()
+      blob[table][id] = tinfo
+      tinfo[tblmap[table]] = id
       tinfo["iid"] = interfaces[tinfo["iface"]]["iid"]
-      nextNum += 1
 
   # File pivoting. Joy.
   def schema():