Skip to content

Commit

Permalink
Code cleanup and addition of MeetingGeneratorMap to match the existin…
Browse files Browse the repository at this point in the history
…g interface for meeting generators
  • Loading branch information
sibange committed Sep 20, 2024
1 parent 884ded1 commit 1c5fe13
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 180 deletions.
4 changes: 4 additions & 0 deletions mms_msg/sampling/pattern/meeting/state_based/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
from . import sampler
from . import transition_model
from . import weighted_meeting_sampler

import logging
import sys
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
62 changes: 36 additions & 26 deletions mms_msg/sampling/pattern/meeting/state_based/action_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,20 @@ class DistributionActionHandler(ActionHandler):
When the offset is calculated from these intermediate values then VAD data is also taken into account,
when available.
Important! Due to the internal selection of the samples the distribution of the offsets depends
on the given input dataset. When using the statistics from a dataset are used and then this dataset is used as
input dataset the mean of the resulting overlap distribution is typically smaller than the original.
This effect increases when the process of sampling and generation is done multiple times.
Thus, this is not recommended.
Important! When selecting a fitting source for the OV action, the overlap is computed after a source is selected.
Due to this the resulting overlap distribution depends heavily on the length of the samples
in the given input dataset.
This also leads to the effect, that when using an input dataset with similar mean sample length,
the resulting overlap distribution is skewed toward smaller values.
When the process of sampling and generation is then repeated recursively with the same input dataset
(generate a dataset with the action handler, use it as the source dataset, repeat, ...),
the resulting overlap distribution gets smaller with each iteration. Thus, this is not recommended.
Properties:
overlap_sampler: Used sampler for the overlap
silence_sampler: Used sampler for the silence
backchannel_start_sampler: Used sampler for offset off the backchannel source
backchannel_start_sampler: Used sampler for offset of the backchannel source
border_margin: Used as minimal overlap during the OV action and minimal spacing
of the backchannel source from the borders of the foreground source.
use_vad: Is VAD data present in the given datasets and should this data be used for determining sources
Expand Down Expand Up @@ -150,7 +154,7 @@ def __init__(self, overlap_sampler: OverlapSampler, silence_sampler: SilenceSamp
self._scenario_ids = None
self._example_id = None

self._last_foreground_speaker = None
self._last_foreground_scenario = None

self._base_examples = None
self._grouped_datasets = None
Expand All @@ -176,7 +180,7 @@ def start(self, example_id: str, scenario_ids: List[str], base_examples: List[Di
# Adding the first speaker
current_source = copy.deepcopy(base_examples[scenario_id_index])
offset = 0
self._last_foreground_speaker = scenario_ids[scenario_id_index]
self._last_foreground_scenario = scenario_ids[scenario_id_index]

return True, current_source, offset, None

Expand Down Expand Up @@ -236,13 +240,13 @@ def _sample_source(self, current_scenario: str, current_dataset: Union[Dict, Dat
segment_idx: int) -> Dict[str, Any]:
"""
Internal function that samples a source from the current scenario from the current dataset using
the random round-robin method. The archive consistency for multiple executions all previously
sampled examples and the index of the current exampled are used as seed for the random number generator.
the random round-robin method. To achieve consistency across multiple executions, all previously
sampled examples and the index of the current example are used as seed for the random number generator.
Args:
current_scenario: Scenario from which the source should be sampled
current_dataset: Dataset from which the source should be sampled
examples: List of previously sampled sources (used as seed for rng)
examples: List of previously sampled sources
segment_idx: Index of the currently sampled source (used as seed for rng)
Returns: Dictionary which represents the sampled source
Expand Down Expand Up @@ -276,7 +280,7 @@ def _action_th_ts(self, current_scenario: str, current_dataset: Union[Dict, Data

current_source = self._sample_source(current_scenario, current_dataset, examples, segment_idx)
silence = self.silence_sampler(get_rng(self._example_id, segment_idx, 'silence'))
self._last_foreground_speaker = current_scenario
self._last_foreground_scenario = current_scenario
offset = max([x['speaker_end'][source_key] for x in examples]) + silence

return current_source, offset
Expand All @@ -303,7 +307,7 @@ def _action_ov(self, current_scenario: str, current_dataset: Union[Dict, Dataset
overlap = self.overlap_sampler(examples, current_source,
rng=get_rng(self._example_id, segment_idx, 'overlap'),
use_vad=self.use_vad)
self._last_foreground_speaker = current_scenario
self._last_foreground_scenario = current_scenario

offset = max([x['speaker_end'][source_key] for x in examples]) - overlap

Expand All @@ -328,15 +332,17 @@ def _action_bc(self, current_scenario: str, current_dataset: Union[Dict, Dataset
Returns: Tuple of the sampled source and the corresponding offset
"""

last_foreground_example = list(filter(lambda x: x['speaker_id'] == self._last_foreground_speaker, examples))[-1]
last_foreground_example = list(filter(lambda x: x['scenario'] == self._last_foreground_scenario, examples))[-1]

backchannel_speaker_ends = [x['speaker_end'] for x in
list(filter(lambda x: not x['speaker_id'] == self._last_foreground_speaker,
examples))]
backchannel_speaker_ends = [
x['speaker_end'][source_key]
for x in examples
if x['scenario'] != self._last_foreground_scenario
]

foreground_length = last_foreground_example['num_samples'][source_key]
free_backchannel_length = last_foreground_example['speaker_end'][source_key] - max(
[x[source_key] for x in backchannel_speaker_ends] + [0])
free_backchannel_length = (last_foreground_example['speaker_end'][source_key]
- max(backchannel_speaker_ends + [0]))

max_allowed_length = min(foreground_length, free_backchannel_length) - 2 * self.bc_border_margin

Expand All @@ -346,8 +352,8 @@ def _action_bc(self, current_scenario: str, current_dataset: Union[Dict, Dataset
current_source = copy.deepcopy(current_source)

if current_source is not None:
min_possible_start_offset = max(max([x[source_key] for x in backchannel_speaker_ends] + [0]),
last_foreground_example['offset'][source_key]) \
min_possible_start_offset = max(backchannel_speaker_ends + [0] +
[last_foreground_example['offset'][source_key]]) \
+ self.bc_border_margin
max_possible_start_offset = last_foreground_example['speaker_end'][source_key] - \
current_source['num_samples'][source_key2] - self.bc_border_margin
Expand All @@ -373,7 +379,7 @@ def example_id(self) -> str:

@property
def last_foreground_speaker(self) -> str:
return self._last_foreground_speaker
return self._last_foreground_scenario

@property
def grouped_datasets(self) -> Dict[str, Union[Dict, Dataset]]:
Expand All @@ -400,18 +406,22 @@ def rejection_sampling(rng: np.random.Generator, current_scenario: str, current_
Returns: source: when its fitting, None: when no fitting source is found
"""

for tries in range(max_tries):
sequence = [x['example_id'] for x in examples if x['scenario'] == current_scenario]
rejected_sources = []

for _ in range(max_tries):
current_source_id = sequence_sampling.sample_random_round_robin(
current_dataset[current_scenario].keys(),
sequence=[
x['example_id'] for x in examples
if x['scenario'] == current_scenario],
sequence=sequence + rejected_sources,
rng=rng
)
current_source = copy.deepcopy(current_dataset[current_scenario][current_source_id])

if current_source['num_samples']['observation'] >= min_length and (
max_length is None or current_source['num_samples']['observation'] <= max_length):
return current_source
else:
rejected_sources.append(current_source['example_id'])

# When no fitting source is found None is returned
return None
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@


logger = logging.getLogger('dataset_statistics_estimation')
logger.setLevel(logging.INFO)
if sys.stdout not in [handler.stream for handler in logger.handlers if type(handler) is logging.StreamHandler]:
logger.addHandler(logging.StreamHandler(sys.stdout))


class MeetingStatisticsEstimatorMarkov:
Expand Down Expand Up @@ -91,14 +88,11 @@ def fit(self, dataset: [Dataset, Dict], use_vad: bool = False) -> None:
silence_durations = []
overlap_durations = []

n = 0

num_speakers = 0

for sample in self._dataset:
for n, sample in enumerate(self._dataset):
if n % 100 == 0:
logger.info(f'Processed samples: {n}')
n += 1

# Depending on the usage of VAD data different keys are used
if use_vad:
Expand All @@ -115,32 +109,30 @@ def fit(self, dataset: [Dataset, Dict], use_vad: bool = False) -> None:
last_foreground_end = speaker_ends[0]
last_foreground_speaker = speaker_ids[0]

for i in range(len(offsets) - 1):
for speaker_id, offset, speaker_end in list(zip(speaker_ids, offsets, speaker_ends))[1:]:
state_occurence_counter[current_state] += 1
# Turn-hold
if last_foreground_speaker == speaker_ids[i + 1]:
if offsets[i + 1] - last_foreground_end < 0:
input()
if last_foreground_speaker == speaker_id:
new_state = 0
silence_durations.append(offsets[i + 1] - last_foreground_end)
silence_durations.append(offset - last_foreground_end)

# Turn-switch
elif last_foreground_end < offsets[i + 1]:
elif last_foreground_end < offset:
new_state = 1
silence_durations.append(offsets[i + 1] - last_foreground_end)
silence_durations.append(offset - last_foreground_end)

# Overlap
elif last_foreground_end < speaker_ends[i + 1]:
elif last_foreground_end < speaker_end:
new_state = 2
overlap_durations.append(last_foreground_end - offsets[i + 1])
overlap_durations.append(last_foreground_end - offset)
# Backchannel
else:
new_state = 3

# Adjust foreground information, in all states except backchannel
if new_state in {0, 1, 2}:
last_foreground_end = speaker_ends[i + 1]
last_foreground_speaker = speaker_ids[i + 1]
if new_state in (0, 1, 2):
last_foreground_end = speaker_end
last_foreground_speaker = speaker_id

state_transition_counter[current_state][new_state] += 1

Expand Down
58 changes: 50 additions & 8 deletions mms_msg/sampling/pattern/meeting/state_based/meeting_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from mms_msg.sampling.pattern.meeting.state_based.action_handler import DistributionActionHandler
from mms_msg.sampling.pattern.meeting.state_based.sampler import DistributionSilenceSampler, DistributionOverlapSampler
from lazy_dataset import Dataset
from typing import Dict, Type, Optional
from typing import Dict, Type, Optional, Any
import mms_msg


Expand All @@ -13,7 +13,7 @@ class MeetingGenerator:
The samples that are used to generate the artificial data are taken from an input dataset
that can be independent of the dataset the state transitions are estimated from.
This class used a Markov based model for the different transitions of the speakers and tries to balance
This class uses a Markov based model for the different transitions of the speakers and tries to balance
the activity of all speakers in each meeting.
Properties:
Expand Down Expand Up @@ -57,7 +57,7 @@ def fit(self, source_dataset: [Dict, Dataset], use_vad: [bool] = False) -> None:
self.overlap = DistributionOverlapSampler(max_concurrent_spk=2, distribution=db_sampler.overlap_distribution)

def generate(self, input_dataset: [Dict, Dataset], num_speakers: int = 2, duration: int = 960000,
num_meetings: Optional[int] = None, use_vad: [bool] = False) -> Dataset:
num_meetings: Optional[int] = None, use_vad: bool = False) -> Dataset:
"""Generate a dataset of artificial meeting, with sources from the input_dataset.
The distribution of the generated dataset follows the last fitted distribution,
so the fit method must be called at least once before calling this method.
Expand All @@ -77,12 +77,12 @@ def generate(self, input_dataset: [Dict, Dataset], num_speakers: int = 2, durati
"""

if self.model is None or self.silence is None or self.overlap is None:
raise Exception('No dataset is fitted, you have to use the fit method first.')
raise ValueError('No dataset is fitted, you have to use the fit method first.')

if self.model.num_speakers != num_speakers:
try:
self.model.change_num_speakers(num_speakers)
except SystemError:
except TypeError:
print('Cannot change the number of speakers of the transition model.'
'It is possible that the generation fails for the desired number of speakers.')

Expand All @@ -91,10 +91,52 @@ def generate(self, input_dataset: [Dict, Dataset], num_speakers: int = 2, durati
if num_meetings is not None:
ds = ds[:num_meetings]

ds = ds.map(mms_msg.sampling.environment.scaling.UniformScalingSampler())
ds = ds.map(mms_msg.sampling.environment.noise.UniformSNRSampler(20.0, 30.0))

return ds.map(WeightedMeetingSampler(transition_model=self.model, duration=duration,
action_handler=DistributionActionHandler(overlap_sampler=self.overlap,
silence_sampler=self.silence),
use_vad=use_vad)({'*': input_dataset}))


class MeetingGeneratorMap:
    """Callable that generates a meeting replicating the state transition probabilities of another dataset.

    An instance can be mapped over an existing dataset created with get_composition_dataset(),
    so that one meeting is generated for each example of that dataset.

    A Markov based model is used for the different transitions of the speakers, and the generation
    tries to balance the activity of all speakers in each meeting.

    Properties:
        meeting_sampler: Weighted meeting sampler initialized with the statistics from the source dataset,
                         which uses samples from the input dataset.
    """

    def __init__(self, source_dataset: [Dict, Dataset], input_dataset: [Dict, Dataset], duration: int = 960000,
                 use_vad: bool = False, estimator_class: Type = MeetingStatisticsEstimatorMarkov):
        """
        Initialize the Meeting Generator Map.

        Args:
            source_dataset: Dataset for which the statistics are estimated.
                            These statistics are then used to generate new meetings.
            input_dataset: Dataset from which the sources are drawn that are used for generating new meetings.
            duration: Duration that the newly generated examples should roughly have, can be slightly exceeded.
            use_vad: Should VAD data be used. When set to True, VAD data is used during the generation
                     of the new dataset and the output dataset also has VAD information.
            estimator_class: Class that is used to determine the statistics on the source dataset.
                             The constructor must accept at least two parameters: dataset, use_vad
                             Also must have the following properties: model, silence_distribution,
                             overlap_distribution
        """
        # The estimator is only needed here; the sampler below keeps what was derived from it.
        estimator = estimator_class(dataset=source_dataset, use_vad=use_vad)

        transition_model = estimator.model
        silence_sampler = DistributionSilenceSampler(distribution=estimator.silence_distribution)
        overlap_sampler = DistributionOverlapSampler(
            max_concurrent_spk=2,
            distribution=estimator.overlap_distribution,
        )

        action_handler = DistributionActionHandler(
            overlap_sampler=overlap_sampler,
            silence_sampler=silence_sampler,
        )
        sampler_factory = WeightedMeetingSampler(
            transition_model=transition_model,
            duration=duration,
            action_handler=action_handler,
            use_vad=use_vad,
        )

        # The input dataset is registered under the '*' key before the sampler is stored.
        self.meeting_sampler = sampler_factory({'*': input_dataset})

    def __call__(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Apply the stored meeting sampler to the given example and return its result."""
        generated = self.meeting_sampler(example)
        return generated
Loading

0 comments on commit 1c5fe13

Please sign in to comment.