Code cleanup and addition of MeetingGeneratorMap to match the existing interface for meeting generators
fgnt · Sep 20, 2024 · 1c5fe13 · 1c5fe13
1 parent 884ded1
commit 1c5fe13
Show file tree

Hide file tree

Showing 8 changed files with 163 additions and 180 deletions.
diff --git a/mms_msg/sampling/pattern/meeting/state_based/__init__.py b/mms_msg/sampling/pattern/meeting/state_based/__init__.py
@@ -4,3 +4,7 @@
 from . import sampler
 from . import transition_model
 from . import weighted_meeting_sampler
+
+import logging
+import sys
+logging.basicConfig(level=logging.INFO, stream=sys.stdout)
diff --git a/mms_msg/sampling/pattern/meeting/state_based/action_handler.py b/mms_msg/sampling/pattern/meeting/state_based/action_handler.py
@@ -103,16 +103,20 @@ class DistributionActionHandler(ActionHandler):
     When the offset is calculated from these intermediate values then VAD data is also taken into account,
     when available.
 
-    Important! Due to the internal selection of the samples the distribution of the offsets depends
-    on the given input dataset. When using the statistics from a dataset are used and then this dataset is used as
-    input dataset the mean of the resulting overlap distribution is typically smaller than the original.
-    This effect increases when the process of sampling and generation is done multiple times.
-    Thus, this is not recommended.
+    Important! When selecting a fitting source for the OV action, the overlap is computed after a source is selected.
+    Because of this, the resulting overlap distribution depends heavily on the length of the samples
+    in the given input dataset.
+    This also has the effect that, when using an input dataset with similar mean sample length,
+    the resulting overlap distribution is skewed toward smaller values.
+    When the process of sampling and generation is repeated recursively with the same input dataset
+    (generate a dataset with the action handler, use it as source dataset, repeat, ...),
+    the resulting overlap distribution gets smaller with each iteration. Thus, this is not recommended.
+
 
     Properties:
         overlap_sampler: Used sampler for the overlap
         silence_sampler: Used sampler for the silence
-        backchannel_start_sampler: Used sampler for offset off the backchannel source
+        backchannel_start_sampler: Used sampler for offset of the backchannel source
         border_margin:  Used as minimal overlap during the OV action and minimal spacing
             of the backchannel source from the borders of the foreground source.
         use_vad: Is VAD data present in the given datasets and should this data be used for determining sources
@@ -150,7 +154,7 @@ def __init__(self, overlap_sampler: OverlapSampler, silence_sampler: SilenceSamp
         self._scenario_ids = None
         self._example_id = None
 
-        self._last_foreground_speaker = None
+        self._last_foreground_scenario = None
 
         self._base_examples = None
         self._grouped_datasets = None
@@ -176,7 +180,7 @@ def start(self, example_id: str, scenario_ids: List[str], base_examples: List[Di
         # Adding the first speaker
         current_source = copy.deepcopy(base_examples[scenario_id_index])
         offset = 0
-        self._last_foreground_speaker = scenario_ids[scenario_id_index]
+        self._last_foreground_scenario = scenario_ids[scenario_id_index]
 
         return True, current_source, offset, None
 
@@ -236,13 +240,13 @@ def _sample_source(self, current_scenario: str, current_dataset: Union[Dict, Dat
                        segment_idx: int) -> Dict[str, Any]:
         """
         Internal function that samples a source from the current scenario from the current dataset using
-        the random round-robin method. The archive consistency for multiple executions all previously
-        sampled examples and the index of the current exampled are used as seed for the random number generator.
+        the random round-robin method. To achieve consistency across multiple executions, all previously
+        sampled examples and the index of the current example are used as seed for the random number generator.
 
         Args:
             current_scenario: Scenario from which the source should be sampled
             current_dataset: Dataset from which the source should be sampled
-            examples: List of previously sampled sources (used as seed for rng)
+            examples: List of previously sampled sources
             segment_idx: Index of the currently sampled source (used as seed for rng)
 
         Returns: Dictionary which represents the sampled source
@@ -276,7 +280,7 @@ def _action_th_ts(self, current_scenario: str, current_dataset: Union[Dict, Data
 
         current_source = self._sample_source(current_scenario, current_dataset, examples, segment_idx)
         silence = self.silence_sampler(get_rng(self._example_id, segment_idx, 'silence'))
-        self._last_foreground_speaker = current_scenario
+        self._last_foreground_scenario = current_scenario
         offset = max([x['speaker_end'][source_key] for x in examples]) + silence
 
         return current_source, offset
@@ -303,7 +307,7 @@ def _action_ov(self, current_scenario: str, current_dataset: Union[Dict, Dataset
         overlap = self.overlap_sampler(examples, current_source,
                                        rng=get_rng(self._example_id, segment_idx, 'overlap'),
                                        use_vad=self.use_vad)
-        self._last_foreground_speaker = current_scenario
+        self._last_foreground_scenario = current_scenario
 
         offset = max([x['speaker_end'][source_key] for x in examples]) - overlap
 
@@ -328,15 +332,17 @@ def _action_bc(self, current_scenario: str, current_dataset: Union[Dict, Dataset
         Returns: Tuple of the sampled source and the corresponding offset
         """
 
-        last_foreground_example = list(filter(lambda x: x['speaker_id'] == self._last_foreground_speaker, examples))[-1]
+        last_foreground_example = list(filter(lambda x: x['scenario'] == self._last_foreground_scenario, examples))[-1]
 
-        backchannel_speaker_ends = [x['speaker_end'] for x in
-                                    list(filter(lambda x: not x['speaker_id'] == self._last_foreground_speaker,
-                                                examples))]
+        backchannel_speaker_ends = [
+            x['speaker_end'][source_key]
+            for x in examples
+            if x['scenario'] != self._last_foreground_scenario
+        ]
 
         foreground_length = last_foreground_example['num_samples'][source_key]
-        free_backchannel_length = last_foreground_example['speaker_end'][source_key] - max(
-            [x[source_key] for x in backchannel_speaker_ends] + [0])
+        free_backchannel_length = (last_foreground_example['speaker_end'][source_key]
+                                   - max(backchannel_speaker_ends + [0]))
 
         max_allowed_length = min(foreground_length, free_backchannel_length) - 2 * self.bc_border_margin
 
@@ -346,8 +352,8 @@ def _action_bc(self, current_scenario: str, current_dataset: Union[Dict, Dataset
         current_source = copy.deepcopy(current_source)
 
         if current_source is not None:
-            min_possible_start_offset = max(max([x[source_key] for x in backchannel_speaker_ends] + [0]),
-                                            last_foreground_example['offset'][source_key]) \
+            min_possible_start_offset = max(backchannel_speaker_ends + [0] +
+                                            [last_foreground_example['offset'][source_key]]) \
                                         + self.bc_border_margin
             max_possible_start_offset = last_foreground_example['speaker_end'][source_key] - \
                 current_source['num_samples'][source_key2] - self.bc_border_margin
@@ -373,7 +379,7 @@ def example_id(self) -> str:
 
     @property
     def last_foreground_speaker(self) -> str:
-        return self._last_foreground_speaker
+        return self._last_foreground_scenario
 
     @property
     def grouped_datasets(self) -> Dict[str, Union[Dict, Dataset]]:
@@ -400,18 +406,22 @@ def rejection_sampling(rng: np.random.Generator, current_scenario: str, current_
     Returns: source: when its fitting, None: when no fitting source is found
     """
 
-    for tries in range(max_tries):
+    sequence = [x['example_id'] for x in examples if x['scenario'] == current_scenario]
+    rejected_sources = []
+
+    for _ in range(max_tries):
         current_source_id = sequence_sampling.sample_random_round_robin(
             current_dataset[current_scenario].keys(),
-            sequence=[
-                x['example_id'] for x in examples
-                if x['scenario'] == current_scenario],
+            sequence=sequence + rejected_sources,
             rng=rng
         )
         current_source = copy.deepcopy(current_dataset[current_scenario][current_source_id])
 
         if current_source['num_samples']['observation'] >= min_length and (
                 max_length is None or current_source['num_samples']['observation'] <= max_length):
             return current_source
+        else:
+            rejected_sources.append(current_source['example_id'])
+
     # When no fitting source is found None is returned
     return None
diff --git a/mms_msg/sampling/pattern/meeting/state_based/dataset_statistics_estimation.py b/mms_msg/sampling/pattern/meeting/state_based/dataset_statistics_estimation.py
@@ -11,9 +11,6 @@
 
 
 logger = logging.getLogger('dataset_statistics_estimation')
-logger.setLevel(logging.INFO)
-if sys.stdout not in [handler.stream for handler in logger.handlers if type(handler) is logging.StreamHandler]:
-    logger.addHandler(logging.StreamHandler(sys.stdout))
 
 
 class MeetingStatisticsEstimatorMarkov:
@@ -91,14 +88,11 @@ def fit(self, dataset: [Dataset, Dict], use_vad: bool = False) -> None:
         silence_durations = []
         overlap_durations = []
 
-        n = 0
-
         num_speakers = 0
 
-        for sample in self._dataset:
+        for n, sample in enumerate(self._dataset):
             if n % 100 == 0:
                 logger.info(f'Processed samples: {n}')
-            n += 1
 
             # Depending on the usage of VAD data different keys are used
             if use_vad:
@@ -115,32 +109,30 @@ def fit(self, dataset: [Dataset, Dict], use_vad: bool = False) -> None:
             last_foreground_end = speaker_ends[0]
             last_foreground_speaker = speaker_ids[0]
 
-            for i in range(len(offsets) - 1):
+            for speaker_id, offset, speaker_end in list(zip(speaker_ids, offsets, speaker_ends))[1:]:
                 state_occurence_counter[current_state] += 1
                 # Turn-hold
-                if last_foreground_speaker == speaker_ids[i + 1]:
-                    if offsets[i + 1] - last_foreground_end < 0:
-                        input()
+                if last_foreground_speaker == speaker_id:
                     new_state = 0
-                    silence_durations.append(offsets[i + 1] - last_foreground_end)
+                    silence_durations.append(offset - last_foreground_end)
 
                 # Turn-switch
-                elif last_foreground_end < offsets[i + 1]:
+                elif last_foreground_end < offset:
                     new_state = 1
-                    silence_durations.append(offsets[i + 1] - last_foreground_end)
+                    silence_durations.append(offset - last_foreground_end)
 
                 # Overlap
-                elif last_foreground_end < speaker_ends[i + 1]:
+                elif last_foreground_end < speaker_end:
                     new_state = 2
-                    overlap_durations.append(last_foreground_end - offsets[i + 1])
+                    overlap_durations.append(last_foreground_end - offset)
                 # Backchannel
                 else:
                     new_state = 3
 
                 # Adjust foreground information, in all states except backchannel
-                if new_state in {0, 1, 2}:
-                    last_foreground_end = speaker_ends[i + 1]
-                    last_foreground_speaker = speaker_ids[i + 1]
+                if new_state in (0, 1, 2):
+                    last_foreground_end = speaker_end
+                    last_foreground_speaker = speaker_id
 
                 state_transition_counter[current_state][new_state] += 1
 

diff --git a/mms_msg/sampling/pattern/meeting/state_based/meeting_generator.py b/mms_msg/sampling/pattern/meeting/state_based/meeting_generator.py
@@ -3,7 +3,7 @@
 from mms_msg.sampling.pattern.meeting.state_based.action_handler import DistributionActionHandler
 from mms_msg.sampling.pattern.meeting.state_based.sampler import DistributionSilenceSampler, DistributionOverlapSampler
 from lazy_dataset import Dataset
-from typing import Dict, Type, Optional
+from typing import Dict, Type, Optional, Any
 import mms_msg
 
 
@@ -13,7 +13,7 @@ class MeetingGenerator:
     The samples that are used to generate the artificial data is from an input dataset
     that can be independent of the dataset the state transitions are estimates from.
 
-    This class used a Markov based model for the different transitions of the speakers and tries to balance
+    This class uses a Markov based model for the different transitions of the speakers and tries to balance
     the activity of all speakers in each meeting.
 
     Properties:
@@ -57,7 +57,7 @@ def fit(self, source_dataset: [Dict, Dataset], use_vad: [bool] = False) -> None:
         self.overlap = DistributionOverlapSampler(max_concurrent_spk=2, distribution=db_sampler.overlap_distribution)
 
     def generate(self, input_dataset: [Dict, Dataset], num_speakers: int = 2, duration: int = 960000,
-                 num_meetings: Optional[int] = None, use_vad: [bool] = False) -> Dataset:
+                 num_meetings: Optional[int] = None, use_vad: bool = False) -> Dataset:
         """Generate a dataset of artificial meeting, with sources from the input_dataset.
          The distribution of the generated dataset follows the last fitted distribution,
          so the fit method must be called at least once before calling this method.
@@ -77,12 +77,12 @@ def generate(self, input_dataset: [Dict, Dataset], num_speakers: int = 2, durati
         """
 
         if self.model is None or self.silence is None or self.overlap is None:
-            raise Exception('No dataset is fitted, you have to use the fit method first.')
+            raise ValueError('No dataset is fitted, you have to use the fit method first.')
 
         if self.model.num_speakers != num_speakers:
             try:
                 self.model.change_num_speakers(num_speakers)
-            except SystemError:
+            except TypeError:
                 print('Cannot change the number of speakers of the transition model.'
                       'It is possible that the generation fails for the desired number of speakers.')
 
@@ -91,10 +91,52 @@ def generate(self, input_dataset: [Dict, Dataset], num_speakers: int = 2, durati
         if num_meetings is not None:
             ds = ds[:num_meetings]
 
-        ds = ds.map(mms_msg.sampling.environment.scaling.UniformScalingSampler())
-        ds = ds.map(mms_msg.sampling.environment.noise.UniformSNRSampler(20.0, 30.0))
-
         return ds.map(WeightedMeetingSampler(transition_model=self.model, duration=duration,
                                              action_handler=DistributionActionHandler(overlap_sampler=self.overlap,
                                                                                       silence_sampler=self.silence),
                                              use_vad=use_vad)({'*': input_dataset}))
+
+
+class MeetingGeneratorMap:
+    """Class for generating meetings that aim to replicate the state transition probabilities of another dataset.
+       Can be mapped to an existing dataset created with get_composition_dataset()
+       to generate a meeting for each example in the dataset.
+
+       This class uses a Markov based model for the different transitions of the speakers and tries to balance
+       the activity of all speakers in each meeting.
+
+       Properties:
+           meeting_sampler: Weighted meeting sampler initialized with the statistics from the source dataset
+                which uses samples from the input dataset.
+       """
+
+    def __init__(self, source_dataset: [Dict, Dataset], input_dataset: [Dict, Dataset], duration: int = 960000,
+                 use_vad: bool = False, estimator_class: Type = MeetingStatisticsEstimatorMarkov):
+        """
+        Initialize the Meeting Generator Map
+
+        Args:
+            source_dataset: Dataset for which the statistics are estimated.
+                This data can then be used to generate new meetings.
+            input_dataset: Dataset from which the sources are drawn, that are used for generating new meetings
+            duration: Duration that the newly generated examples should roughly have, can be slightly exceeded.
+            use_vad: Whether VAD data should be used. When set to True, VAD data is used
+                during generation of the new dataset and the output dataset also contains VAD information.
+            estimator_class: Class that should be used to determine the statistics of the source dataset.
+                The constructor must accept at least two parameters: dataset, use_vad
+                Also must have the following properties: model, silence_distribution, overlap_distribution
+        """
+        db_sampler = estimator_class(dataset=source_dataset, use_vad=use_vad)
+
+        model = db_sampler.model
+
+        silence = DistributionSilenceSampler(distribution=db_sampler.silence_distribution)
+        overlap = DistributionOverlapSampler(max_concurrent_spk=2, distribution=db_sampler.overlap_distribution)
+
+        self.meeting_sampler = WeightedMeetingSampler(transition_model=model, duration=duration,
+                                                      action_handler=DistributionActionHandler(overlap_sampler=overlap,
+                                                                                               silence_sampler=silence),
+                                                      use_vad=use_vad)({'*': input_dataset})
+
+    def __call__(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        return self.meeting_sampler(example)