Update Dataclasses and add helm id (stanford-crfm#1659)

Co-authored-by: Andy Z <[email protected]> Co-authored-by: Andy Z <[email protected]> Co-authored-by: Andy Z <[email protected]> Co-authored-by: Andy Z <[email protected]> Co-authored-by: Andy Z <[email protected]> Co-authored-by: Andy Z <[email protected]>
nelson-liu · Jun 21, 2023 · 2905ecc · 2905ecc
1 parent 2bb4d78
commit 2905ecc
Show file tree

Hide file tree

Showing 17 changed files with 318 additions and 1,319 deletions.
diff --git a/scripts/data_overlap/README.md b/scripts/data_overlap/README.md
@@ -1,6 +1,5 @@
 # Data Overlap Script (no dependencies on HELM)
 
-There are 2 scripts, one with and one without Apache Beam.
 
 ## Installation
 
@@ -39,34 +38,6 @@ There are additional optional args:
 --tags tag1 tag2
 ```
 
-## Beam API
-
-Model developers should implement an Apache Beam pipeline that creates a `PCollection[str]` of documents, and then pass it to `ComputeAndWriteDataOverlapStats()` with the appropriate arguments.
-
-Note: Each record in the `PCollection[str]` should contain an _entire_ document, not a single line from a document.
-
-```python
-with beam.Pipeline() as pipeline:
-    _ = (
-        pipeline
-        # The model developer should modify these lines to read from the actual training set.
-        | "Read" >> beam.io.ReadFromText(input_data)
-        | "ExtractTextFromDocument" >> beam.Map(extract_text_from_document)
-        # Call the HELM Data Overlap Apache Beam API.
-        | "ComputeAndWriteDataOverlapStats" >> ComputeAndWriteDataOverlapStats(
-            scenario_data_path=scenario_data,
-            n_values=n_values,
-            normalization=normalization,
-            tags=tags
-        )
-    )
-```
-
-## Notes
-
-The beam script does not support outputting overlapping ngrams yet.
-
-
 ## Docker
 
 To create and run docker image:

diff --git a/scripts/data_overlap/compute_data_overlap_metrics.py b/scripts/data_overlap/compute_data_overlap_metrics.py
diff --git a/scripts/data_overlap/data_overlap_beam.py b/scripts/data_overlap/data_overlap_beam.py
diff --git a/scripts/data_overlap/data_overlap_spec.py b/scripts/data_overlap/data_overlap_spec.py
@@ -0,0 +1,33 @@
+from dataclasses import dataclass
+from typing import List
+from light_scenario import LightScenarioKey
+
+
+@dataclass(frozen=True)
+class OverlapProtocolSpec:
+    """Specification for how we compute overlap"""
+
+    # The n-gram size (the "n" in n-gram) used when computing overlap
+    n: int
+
+
+@dataclass(frozen=True)
+class DataOverlapStatsKey:
+    """Key identifying a DataOverlapStats entry: a light scenario key plus an overlap protocol spec"""
+
+    light_scenario_key: LightScenarioKey
+
+    overlap_protocol_spec: OverlapProtocolSpec
+
+
+@dataclass(frozen=True)
+class DataOverlapStats:
+    """Dataclass that represents output data overlap stats"""
+
+    data_overlap_stats_key: DataOverlapStatsKey
+
+    num_instances: int
+
+    instance_ids_with_overlapping_input: List[str]
+
+    instance_ids_with_overlapping_reference: List[str]
diff --git a/scripts/data_overlap/data_overlap_stats.py b/scripts/data_overlap/data_overlap_stats.py
diff --git a/scripts/data_overlap/light_scenario.py b/scripts/data_overlap/light_scenario.py
@@ -1,8 +1,13 @@
 from dataclasses import dataclass
-from typing import List, Dict, Hashable
+from typing import List, Optional
 
+try:
+    from scenarios.scenario import ScenarioSpec
+except Exception:
+    from helm.benchmark.scenarios.scenario import ScenarioSpec
 
-@dataclass(frozen=True, eq=False)
+
+@dataclass(frozen=True)
 class LightInstance:
     """
     A lighter `Instance` with only text fields.
@@ -14,24 +19,31 @@ class LightInstance:
     references: List[str]
     """References that help us evaluate"""
 
+    id: Optional[str] = None
+    """HELM instance id (optional; defaults to None)"""
+
 
 @dataclass(frozen=True)
 class LightScenarioKey:
-    """Unique key representing a `LightScenario` instance."""
+    """
+    Key for LightScenario
+    """
+
+    scenario_spec: ScenarioSpec
 
-    metadata: Dict[str, Hashable]
+    split: str
 
     def __hash__(self):
-        return hash(tuple((k, self.metadata[k]) for k in sorted(self.metadata.keys())))
+        return hash((self.scenario_spec, self.split))
 
 
-@dataclass(frozen=True, eq=False)
+@dataclass(frozen=True)
 class LightScenario:
     """
     A lighter `Scenario`.
     """
 
-    light_scenario_key: LightScenarioKey
+    scenario_key: LightScenarioKey
 
-    light_instances: List[LightInstance]
+    instances: List[LightInstance]
     """Instances of this scenario"""
diff --git a/scripts/data_overlap/scenario_data b/scripts/data_overlap/scenario_data