Commit 8faecdb

cmt
Vanessa Beck committed Dec 9, 2024
1 parent 05e9a60 commit 8faecdb
Showing 15 changed files with 651 additions and 67 deletions.
30 changes: 30 additions & 0 deletions __init__.py
@@ -0,0 +1,30 @@
"""
Dynamic Summarization and Adaptive Clustering Framework
=======================================================
A framework for real-time research synthesis using dynamic clustering
and adaptive summarization techniques.
Main Components:
----------------
- Enhanced Embedding Generation
- Dynamic Clustering
- Adaptive Summarization
- Interactive Visualization
"""

__version__ = '0.1.0'
__author__ = 'Your Name'
__license__ = 'MIT'

from .embedding_generator import EnhancedEmbeddingGenerator
from .clustering.dynamic_cluster_manager import DynamicClusterManager
from .summarization.adaptive_summarizer import AdaptiveSummarizer
from .utils.style_selector import AdaptiveStyleSelector

__all__ = [
    'EnhancedEmbeddingGenerator',
    'DynamicClusterManager',
    'AdaptiveSummarizer',
    'AdaptiveStyleSelector'
]
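
Taken together, these exports sketch a four-stage pipeline: embed, cluster, pick a style, summarize. A minimal wiring sketch follows; the package name (synsearch, inferred from the repository path) and every method name below are assumptions for illustration, since the commit shows only the exported class names.

# Hypothetical end-to-end wiring; all method names and signatures are
# assumptions based on the class names, not a confirmed API.
from synsearch import (
    EnhancedEmbeddingGenerator,
    DynamicClusterManager,
    AdaptiveSummarizer,
    AdaptiveStyleSelector,
)

texts = ['First paper abstract ...', 'Second paper abstract ...']

embeddings = EnhancedEmbeddingGenerator().generate_embeddings(texts)    # assumed method
labels = DynamicClusterManager(config={}).fit_predict(embeddings)       # assumed method
style = AdaptiveStyleSelector().select_style(texts)                     # assumed method
summaries = AdaptiveSummarizer().summarize(texts, labels, style=style)  # assumed method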
59 changes: 54 additions & 5 deletions config/config.yaml
@@ -1,6 +1,6 @@
 data:
   xlsum_dataset: "GEM/xlsum"
-  scisummnet_path: "/Users/vanessa/Dropbox/synsearch/data/scisummnet_release1.1__20190413"
+  scisummnet_path: "data/scisummnet_release1.1__20190413"
   processed_path: "outputs/processed"

 preprocessing:
@@ -14,7 +14,8 @@ preprocessing:
   max_text_length: 1000

 embedding:
-  model_name: "sentence-transformers/all-mpnet-base-v2"
+  model_name: "all-mpnet-base-v2"
+  dimension: 768
   batch_size: 32
   max_seq_length: 128
   device: null  # Will use performance.device setting
@@ -29,6 +30,10 @@ clustering:
   device: null  # Will use performance.device setting
   num_workers: null  # Will use performance.num_workers setting
   output_dir: "outputs/clusters"
+  hybrid_mode: true
+  params:
+    n_clusters: 5
+    min_cluster_size: 10

 checkpoints:
   dir: "outputs/checkpoints"
@@ -47,6 +52,7 @@ checkpoints:
   summarization:
     enabled: true
     save_intermediate: true
+    save_frequency: 100

   personalization:
     enabled: true
@@ -65,9 +71,46 @@ summarization:
   max_length: 512
   min_length: 50
   batch_size: 8
+  enabled: true
+  style_params:
+    detailed:
+      max_length: 512
+      min_length: 150
+      num_beams: 4
+    balanced:
+      max_length: 384
+      min_length: 100
+      num_beams: 3
+    concise:
+      max_length: 256
+      min_length: 50
+      num_beams: 2
+  thresholds:
+    lexical_diversity:
+      low: 0.3
+      high: 0.6
+    variance:
+      low: 0.1
+      high: 0.3
+    complexity:
+      low: 10
+      high: 25
   device: null  # Will use performance.device setting
   num_workers: null  # Will use performance.num_workers setting
   output_dir: "outputs/summaries"
+  style_thresholds:
+    high_variance: 0.8
+    high_lexical_div: 0.7
+  styles:
+    technical:
+      max_length: 200
+      focus: "terminology"
+    detailed:
+      max_length: 300
+      focus: "comprehensive"
+    balanced:
+      max_length: 150
+      focus: "general"

 visualization:
   enabled: true
@@ -77,14 +120,20 @@ visualization:
   min_dist: 0.1
   metric: "cosine"
   random_state: 42
+  plot_types:
+    - "umap"
+    - "tsne"
+    - "wordcloud"

 evaluation:
   output_dir: "outputs/evaluation"
   metrics:
     - "rouge"
     - "clustering"
+    - "silhouette"
+    - "davies_bouldin"
+    - "bleu"
+    - "bertscore"
+  thresholds:
+    rouge_l: 0.4
+    bleu: 0.3

 performance:
   device: null  # Will auto-detect GPU/CPU
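
The summarization thresholds block is what drives adaptive style selection: a document set scoring below the low cut-off on lexical diversity would get a concise summary, above the high cut-off a detailed one, and a balanced style in between. A minimal sketch of consuming these values follows; the selection rule itself is an illustrative assumption, since the commit does not show the style selector's implementation.

import yaml

with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

def pick_style(lexical_diversity: float, config: dict) -> str:
    # Illustrative routing rule only; the project's actual logic lives in
    # src/utils/style_selector.py, which this commit does not show.
    t = config["summarization"]["thresholds"]["lexical_diversity"]
    if lexical_diversity < t["low"]:
        return "concise"
    if lexical_diversity > t["high"]:
        return "detailed"
    return "balanced"

params = config["summarization"]["style_params"][pick_style(0.45, config)]
print(params)  # -> {'max_length': 384, 'min_length': 100, 'num_beams': 3}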
35 changes: 31 additions & 4 deletions requirements.txt
@@ -1,7 +1,19 @@
-# Core Data Processing
-pandas>=1.5.0
+# Core dependencies
 numpy>=1.21.0
+scikit-learn>=1.0.0
+pandas>=1.3.0
+torch>=1.9.0
+transformers>=4.11.0
+datasets>=2.0.0
-scikit-learn>=0.24.0
+nltk>=3.6.0
+plotly>=5.0.0
+dash>=2.0.0
+
+# New dependencies for adaptive summarization
+sentence-transformers>=2.2.0
+hdbscan>=0.8.28
+umap-learn>=0.5.2
+wordcloud>=1.8.1

 # Deep Learning & Embeddings
 torch>=1.9.0
@@ -62,4 +74,19 @@ pytest-asyncio>=0.16.0

 # Multi-core CPU and GPU Optimization
 concurrent-futures
-multiprocessing
+multiprocessing
+
+# New dependencies for adaptive style selection
+textstat>=0.7.3              # For text complexity analysis
+lexical-diversity>=0.1.1     # For lexical diversity metrics
+language-tool-python>=2.7.1  # For grammar checking
+
+# Clustering and Visualization
+hdbscan>=0.8.28
+umap-learn>=0.5.2
+plotly>=5.0.0
+dash>=2.0.0
+
+# Domain-agnostic processing
+spacy>=3.2.0
+textacy>=0.12.0
16 changes: 11 additions & 5 deletions src/__init__.py
@@ -7,14 +7,20 @@

 from .data_loader import DataLoader
 from .data_preparation import DataPreparator
-from .embedding_generator import EmbeddingGenerator
-from .preprocessor import TextPreprocessor
 from .data_validator import DataValidator
+from .embedding_generator import EnhancedEmbeddingGenerator
+from .preprocessor import DomainAgnosticPreprocessor
+from .clustering.dynamic_cluster_manager import DynamicClusterManager
+from .summarization.adaptive_summarizer import AdaptiveSummarizer
+from .utils.style_selector import AdaptiveStyleSelector

 __all__ = [
     'DataLoader',
     'DataPreparator',
-    'EmbeddingGenerator',
-    'TextPreprocessor',
-    'DataValidator'
+    'DataValidator',
+    'EnhancedEmbeddingGenerator',
+    'DomainAgnosticPreprocessor',
+    'DynamicClusterManager',
+    'AdaptiveSummarizer',
+    'AdaptiveStyleSelector'
 ]
40 changes: 40 additions & 0 deletions src/clustering/dynamic_clusterer.py
@@ -0,0 +1,40 @@
from sklearn.cluster import KMeans, DBSCAN
import hdbscan
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

class DynamicClusterer:
    def __init__(self, config):
        self.config = config
        self.metrics = {}

    def select_best_algorithm(self, embeddings: np.ndarray) -> tuple:
        """Dynamically select the best clustering algorithm based on data characteristics."""
        algorithms = {
            'hdbscan': (hdbscan.HDBSCAN(
                min_cluster_size=self.config['clustering']['min_size'],
                metric='euclidean'
            ), True),  # (algorithm, handles_noise)
            'kmeans': (KMeans(
                n_clusters=self.config['clustering']['n_clusters'],
                random_state=42
            ), False)
        }

        best_score = -1
        best_labels = None
        best_algo = None

        for name, (algo, handles_noise) in algorithms.items():
            labels = algo.fit_predict(embeddings)
            # HDBSCAN labels noise points -1; score only the non-noise
            # points, keeping embeddings and labels aligned.
            mask = labels != -1 if handles_noise else np.ones_like(labels, dtype=bool)

            if len(np.unique(labels[mask])) > 1:  # Need at least two clusters to score
                score = silhouette_score(embeddings[mask], labels[mask])
                if score > best_score:
                    best_score = score
                    best_labels = labels
                    best_algo = name

        return best_labels, best_algo, best_score
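
A quick smoke test of the selector on synthetic data (the import path and blob parameters are illustrative; the config keys are the ones the class reads):

from sklearn.datasets import make_blobs
from src.clustering.dynamic_clusterer import DynamicClusterer  # assumed path

# Arbitrary, well-separated synthetic embeddings.
embeddings, _ = make_blobs(n_samples=200, centers=4, n_features=16, random_state=42)

config = {'clustering': {'min_size': 10, 'n_clusters': 4}}
labels, algo, score = DynamicClusterer(config).select_best_algorithm(embeddings)
print(f'best algorithm: {algo}, silhouette: {score:.3f}')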
28 changes: 28 additions & 0 deletions src/evaluation/eval_pipeline.py
@@ -0,0 +1,28 @@
from typing import Dict, List, Any
import numpy as np
from ..utils.metrics_utils import calculate_cluster_metrics, calculate_summary_metrics
from ..utils.logging_utils import MetricsLogger

class EvaluationPipeline:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.logger = MetricsLogger(config)

    def evaluate_clustering(self, embeddings: np.ndarray, labels: np.ndarray) -> Dict[str, float]:
        """Evaluate clustering quality."""
        metrics = calculate_cluster_metrics(embeddings, labels)
        self.logger.log_metrics('clustering', metrics)
        return metrics

    def evaluate_summaries(self,
                           generated_summaries: List[str],
                           reference_summaries: List[str]) -> Dict[str, float]:
        """Evaluate summary quality."""
        metrics = {
            'summary_metrics': [
                calculate_summary_metrics(gen, ref)
                for gen, ref in zip(generated_summaries, reference_summaries)
            ]
        }
        self.logger.log_metrics('summarization', metrics)
        return metrics
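
A usage sketch, assuming the relative imports resolve in a package context and that MetricsLogger accepts a plain dict (the config payload below is a placeholder, not a documented schema):

import numpy as np
from src.evaluation.eval_pipeline import EvaluationPipeline  # assumed path

pipeline = EvaluationPipeline(config={'evaluation': {'output_dir': 'outputs/evaluation'}})

embeddings = np.random.rand(100, 768)       # e.g. all-mpnet-base-v2-sized vectors
labels = np.random.randint(0, 5, size=100)  # dummy cluster assignments
cluster_metrics = pipeline.evaluate_clustering(embeddings, labels)

summary_metrics = pipeline.evaluate_summaries(
    generated_summaries=['A generated summary.'],
    reference_summaries=['A reference summary.'],
)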
40 changes: 37 additions & 3 deletions src/evaluation/metrics.py
@@ -110,9 +110,43 @@ def calculate_baseline_metrics(self, dataset_name: str, metrics: Dict) -> Dict[s
         }
         return baseline_metrics

-    def calculate_comprehensive_metrics(self, summaries, references, embeddings=None):
-        """Calculate both content and semantic metrics"""
-        # ... implementation needed
+    def calculate_comprehensive_metrics(
+        self,
+        summaries: Dict[str, Dict],
+        references: Dict[str, str],
+        embeddings: Optional[Dict[str, np.ndarray]] = None
+    ) -> Dict[str, float]:
+        """Calculate comprehensive metrics including style-aware evaluation."""
+        metrics = {}
+
+        # Standard ROUGE scores
+        rouge_scores = self.calculate_rouge_scores(
+            [s['summary'] for s in summaries.values()],
+            list(references.values())
+        )
+        metrics.update(rouge_scores)
+
+        # Style-specific metrics
+        style_metrics = self._calculate_style_metrics(summaries)
+        metrics.update(style_metrics)
+
+        return metrics
+
+    def _calculate_style_metrics(
+        self,
+        summaries: Dict[str, Dict]
+    ) -> Dict[str, float]:
+        """Calculate metrics specific to different summary styles."""
+        style_metrics = {
+            'technical_accuracy': 0.0,
+            'conciseness_ratio': 0.0,
+            'detail_coverage': 0.0
+        }
+
+        # Placeholder values; style-specific scoring would vary with each
+        # summary's style
+
+        return style_metrics

 def calculate_dataset_metrics(summaries, references):
     """Calculate dataset-specific metrics"""
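
The _calculate_style_metrics placeholder above returns zeros. One plausible, purely illustrative instantiation of conciseness_ratio would compare each summary's word count against its style's configured budget (assuming each summaries entry records its style name):

def conciseness_ratio(summaries: dict, style_params: dict) -> float:
    # Illustrative only: mean ratio of actual word count to the budgeted
    # max_length of each summary's style; assumes entries look like
    # {"summary": "...", "style": "balanced"}.
    ratios = [
        len(s["summary"].split()) / style_params[s["style"]]["max_length"]
        for s in summaries.values()
    ]
    return sum(ratios) / len(ratios)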
(The remaining 8 of the 15 changed files are not shown.)
