Skip to content

Commit

Permalink
Enhance project structure: update setup.py with new dependencies and …
Browse files Browse the repository at this point in the history
…versioning, add Dockerfile and docker-compose for containerization, implement Makefile for build automation, and introduce pre-commit hooks for code quality checks.
  • Loading branch information
stochastic-sisyphus authored Dec 10, 2024
1 parent e1f5ff0 commit b632911
Show file tree
Hide file tree
Showing 9 changed files with 271 additions and 12 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,7 @@ build/
*.log
.coverage
coverage.xml
htmlcov/
htmlcov/

# GitHub configuration directory (intentionally untracked)
.github/
25 changes: 25 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

# Pre-commit hooks: basic hygiene checks plus Black/isort/flake8 format and
# lint gates. Run locally with `pre-commit run --all-files`.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: debug-statements

  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black

  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort

  - repo: https://github.com/pycqa/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
        # flake8-docstrings adds D1xx docstring checks on top of plain flake8.
        additional_dependencies: [flake8-docstrings]
30 changes: 30 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Slim Python 3.8 base keeps the image small; pin a minor version for reproducibility.
FROM python:3.8-slim

WORKDIR /app

# Build tools are needed to compile native wheels (e.g. hdbscan needs gcc/g++),
# git for VCS-pinned requirements, and curl for the HEALTHCHECK below.
# NOTE: the original image never installed curl, so the health probe could not run.
# Clean apt lists in the same layer so the cache does not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    g++ \
    gcc \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching: this layer is reused until
# requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy project files
COPY . .

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Run tests at build time so a broken build fails early.
RUN python -m pytest tests/

# Run the container as an unprivileged user.
RUN useradd --create-home appuser
USER appuser

# Probe the app's health endpoint; container is marked unhealthy after 3 misses.
# NOTE(review): assumes the app serves /health on port 8000 — confirm in src.main.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Default command
CMD ["python", "-m", "src.main"]
32 changes: 32 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

# Build/dev automation. All targets are phony (declared once, including docs,
# which the original file declared in a second .PHONY line).
.PHONY: install test lint format clean docker-build docker-run docs

# Install runtime dependencies into the active environment.
install:
	pip install -r requirements.txt

# Run the test suite.
test:
	python -m pytest tests/

# Static checks: style (flake8), types (mypy), and formatting drift (black).
lint:
	flake8 src/ tests/
	mypy src/
	black --check .

# Auto-format the codebase.
format:
	black .
	isort .

# Remove Python caches and coverage artifacts.
clean:
	find . -type d -name "__pycache__" -exec rm -r {} +
	find . -type f -name "*.pyc" -delete
	rm -rf .coverage htmlcov/

# Container workflow via docker-compose.
docker-build:
	docker-compose build

docker-run:
	docker-compose up

# Build the Sphinx HTML documentation.
docs:
	sphinx-build -b html docs/source docs/build
23 changes: 23 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# NOTE: the top-level `version` key is obsolete under Compose v2 (ignored with
# a warning) but kept for compatibility with legacy docker-compose v1.
version: '3.8'

services:
  # Main application container; GPU 0 is reserved via the nvidia device driver.
  app:
    build: .
    volumes:
      - ./data:/app/data
      - ./outputs:/app/outputs
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  # One-shot test runner built from the same image; mounts tests/ so local
  # edits are picked up without rebuilding.
  tests:
    build: .
    command: python -m pytest tests/
    volumes:
      - ./tests:/app/tests
16 changes: 16 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

# Black formatter: 100-character lines, targeting Python 3.8 syntax,
# applied to .py and .pyi files.
[tool.black]
line-length = 100
target-version = ['py38']
include = '\.pyi?$'

# Pytest: require pytest >= 6.0 and always collect coverage for src/
# (needs the pytest-cov plugin); tests live under tests/.
[tool.pytest.ini_options]
minversion = "6.0"
addopts = "--cov=src"
testpaths = [
"tests",
]

# isort: use the Black-compatible profile so the two tools do not fight.
[tool.isort]
profile = "black"
multi_line_output = 3
21 changes: 21 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@

# flake8: 100-char limit matches Black; E203/W503 are ignored because they
# conflict with Black's slice-spacing and operator-wrapping styles.
[flake8]
max-line-length = 100
exclude = .git,__pycache__,build,dist
ignore = E203,W503

# mypy: strict-ish typing for Python 3.8 sources.
[mypy]
python_version = 3.8
warn_return_any = True
warn_unused_configs = True
disallow_untyped_defs = True

# coverage.py: measure src/ only; never measure the test files themselves.
[coverage:run]
source = src
omit = tests/*

[coverage:report]
# Multi-line INI values must be indented relative to the key, otherwise
# configparser treats each line as a new (invalid) key.
exclude_lines =
    pragma: no cover
    def __repr__
    raise NotImplementedError
34 changes: 23 additions & 11 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,28 @@

def _read_long_description() -> str:
    """Return the README contents for PyPI, or '' if the file is missing.

    The original inline ``open("README.md").read()`` leaked the file handle,
    used the platform default encoding, and crashed the whole build (e.g. an
    sdist install) when README.md was absent.
    """
    try:
        with open("README.md", encoding="utf-8") as fh:
            return fh.read()
    except OSError:
        return ""


setup(
    name="synsearch",
    version="0.1.0",
    packages=find_packages(),
    # Minimum-version pins keep installs reproducible-ish without over-pinning.
    install_requires=[
        "numpy>=1.19.0",
        "torch>=2.0.0",
        "transformers>=4.15.0",
        "scikit-learn>=0.24.0",
        "sentence-transformers>=2.2.0",
        "hdbscan>=0.8.29",
        "plotly>=5.3.0",
        "streamlit>=1.2.0",
        # NOTE(review): pytest is a test-only dependency; consider moving it
        # to extras_require={"test": [...]} instead of install_requires.
        "pytest>=6.0.0"
    ],
    python_requires=">=3.8",
    # TODO(review): placeholder author metadata — fill in before publishing.
    author="Your Name",
    author_email="[email protected]",
    description="Dynamic Summarization and Adaptive Clustering Framework",
    long_description=_read_long_description(),
    long_description_content_type="text/markdown",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
97 changes: 97 additions & 0 deletions tests/test_dynamic_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import pytest
from src.clustering.dynamic_cluster_manager import DynamicClusterManager
import numpy as np
from src.clustering.dynamic_clusterer import DynamicClusterer
from src.clustering.streaming_manager import StreamingClusterManager

@pytest.fixture
def sample_embeddings():
    """Build a 99x768 embedding matrix containing three well-separated clusters."""
    np.random.seed(42)
    total, dim = 100, 768
    per_cluster = total // 3

    # Three tight Gaussian blobs centred at 0, +3 and -3, drawn in that order
    # so the RNG stream (and therefore the data) is fully deterministic.
    blobs = [np.random.normal(centre, 0.1, (per_cluster, dim)) for centre in (0, 3, -3)]
    return np.vstack(blobs)

@pytest.fixture
def config():
    """Clustering parameters shared by the tests below."""
    return dict(
        min_cluster_size=5,
        min_samples=3,
        similarity_threshold=0.8,
    )

def test_adaptive_clustering():
    """DynamicClusterManager.fit_predict returns one label per input row.

    NOTE(review): the final assertion assumes uniform random data always
    yields more than one distinct label; with a density-based backend the
    result could collapse to a single (noise) label — confirm against the
    manager's algorithm-selection logic.
    """
    cluster_manager = DynamicClusterManager({'min_cluster_size': 5})
    embeddings = np.random.rand(100, 768)
    labels = cluster_manager.fit_predict(embeddings)

    assert isinstance(labels, np.ndarray)
    assert len(labels) == len(embeddings)
    assert len(np.unique(labels)) > 1

def test_online_clustering():
    """In online mode, update() returns one label per newly added point."""
    cluster_manager = DynamicClusterManager({'online_mode': True})
    initial_data = np.random.rand(50, 768)
    new_data = np.random.rand(10, 768)

    # Initial clustering — result intentionally unused beyond seeding the
    # manager's internal state before the incremental update.
    initial_labels = cluster_manager.fit_predict(initial_data)
    # Update with new data
    updated_labels = cluster_manager.update(new_data)

    assert len(updated_labels) == len(new_data)

def test_dynamic_clustering(sample_embeddings, config):
    """Test dynamic clustering algorithm selection.

    NOTE(review): the silhouette > 0.5 threshold assumes the chosen algorithm
    recovers the fixture's three well-separated blobs — confirm the metric key
    and scale DynamicClusterer reports.
    """
    clusterer = DynamicClusterer(config)
    labels, metrics = clusterer.fit_predict(sample_embeddings)

    assert isinstance(labels, np.ndarray)
    assert len(labels) == len(sample_embeddings)
    assert isinstance(metrics, dict)
    assert metrics['silhouette_score'] > 0.5

def test_streaming_clustering(sample_embeddings, config):
    """Test streaming cluster updates.

    Streams the fixture embeddings in batches — including the trailing
    partial batch — and checks the manager's aggregate statistics.
    """
    manager = StreamingClusterManager(config)

    batch_size = 20
    all_results = []
    # Step through the data in strides of batch_size. The fixture has 99 rows
    # (100 // 3 * 3), not a multiple of 20; the original floor-division loop
    # (`n_batches = len // batch_size`) silently dropped the last 19 rows and
    # made the total_docs assertion below fail.
    for start_idx in range(0, len(sample_embeddings), batch_size):
        end_idx = min(start_idx + batch_size, len(sample_embeddings))
        batch_embeddings = sample_embeddings[start_idx:end_idx]
        metadata = [{'id': j} for j in range(start_idx, end_idx)]

        results = manager.update(batch_embeddings, metadata)
        all_results.append(results)

    final_clusters = manager.get_clusters()

    assert isinstance(final_clusters, dict)
    assert len(final_clusters) > 0
    assert all(isinstance(cluster, list) for cluster in final_clusters.values())

    # Verify the cluster statistics reported by the last update.
    stats = all_results[-1]['stats']
    assert stats['num_clusters'] > 0
    assert stats['total_docs'] == len(sample_embeddings)
    assert stats['avg_cluster_size'] > 0

def test_empty_clustering(config):
    """Test handling of empty input: fit_predict must raise ValueError."""
    clusterer = DynamicClusterer(config)
    # Zero-length 1-D array — no samples and no feature dimension.
    empty_embeddings = np.array([])

    with pytest.raises(ValueError):
        clusterer.fit_predict(empty_embeddings)

0 comments on commit b632911

Please sign in to comment.