Enhance project structure: update setup.py with new dependencies and versioning, add Dockerfile and docker-compose for containerization, implement Makefile for build automation, and introduce pre-commit hooks for code quality checks.

Commit b632911 (1 parent: e1f5ff0). Showing 9 changed files with 271 additions and 12 deletions.

.gitignore
@@ -55,4 +55,7 @@ build/
 *.log
 .coverage
 coverage.xml
-htmlcov/
+htmlcov/
+
+# everything else
+.github/

.pre-commit-config.yaml
@@ -0,0 +1,25 @@

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: debug-statements

  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black

  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort

  - repo: https://github.com/pycqa/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
        additional_dependencies: [flake8-docstrings]
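
To make concrete what these hooks enforce, here is a small hypothetical snippet (not from this repository) of the kind of code they would flag or rewrite: isort reorders the imports, black fixes the spacing, flake8 with flake8-docstrings reports the missing docstrings, and debug-statements rejects the leftover breakpoint() call.

# Hypothetical example only -- code of the kind the configured hooks are meant to catch.
import sys
import os  # isort: stdlib imports would be re-sorted alphabetically (os before sys)


def load(path):  # flake8-docstrings: D103, missing docstring in public function
    data=open(path).read()  # black: rewritten to "data = open(path).read()"
    breakpoint()  # debug-statements: the commit is rejected while this call remains
    return data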

Dockerfile
@@ -0,0 +1,30 @@
FROM python:3.8-slim

WORKDIR /app

# Install system dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy project files
COPY . .

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Run tests
RUN python -m pytest tests/

# Default command
CMD ["python", "-m", "src.main"]

HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
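
The HEALTHCHECK probes http://localhost:8000/health, which implies that src.main (not included in this diff) serves HTTP on port 8000. As a hedged sketch under that assumption only, a minimal stdlib handler that would satisfy the probe could look like this; the real entry point may expose the route very differently.

# Hypothetical sketch -- the real src.main is not shown in this commit.
# The /health route and port 8000 are assumed solely from the HEALTHCHECK above.
from http.server import BaseHTTPRequestHandler, HTTPServer


class HealthHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        if self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"status": "ok"}')
        else:
            self.send_error(404)


if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8000), HealthHandler).serve_forever()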

Makefile
@@ -0,0 +1,32 @@

.PHONY: install test lint format clean docker-build docker-run

install:
	pip install -r requirements.txt

test:
	python -m pytest tests/

lint:
	flake8 src/ tests/
	mypy src/
	black --check .

format:
	black .
	isort .

clean:
	find . -type d -name "__pycache__" -exec rm -r {} +
	find . -type f -name "*.pyc" -delete
	rm -rf .coverage htmlcov/

docker-build:
	docker-compose build

docker-run:
	docker-compose up

.PHONY: docs
docs:
	sphinx-build -b html docs/source docs/build

docker-compose.yml
@@ -0,0 +1,23 @@ | ||
version: '3.8' | ||
|
||
services: | ||
app: | ||
build: . | ||
volumes: | ||
- ./data:/app/data | ||
- ./outputs:/app/outputs | ||
environment: | ||
- CUDA_VISIBLE_DEVICES=0 | ||
deploy: | ||
resources: | ||
reservations: | ||
devices: | ||
- driver: nvidia | ||
count: 1 | ||
capabilities: [gpu] | ||
|
||
tests: | ||
build: . | ||
command: python -m pytest tests/ | ||
volumes: | ||
- ./tests:/app/tests |
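
The deploy.resources block reserves one NVIDIA GPU for the app service and CUDA_VISIBLE_DEVICES pins it to device 0. Since torch is a declared dependency in setup.py, a quick sanity check run inside the container (a hedged sketch, not part of this commit) can confirm the device is actually visible:

# Hedged sanity check, assuming the container is started with the NVIDIA runtime.
import torch

if torch.cuda.is_available():
    print(f"CUDA device visible: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device visible; computation will fall back to CPU.")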

pyproject.toml
@@ -0,0 +1,16 @@

[tool.black]
line-length = 100
target-version = ['py38']
include = '\.pyi?$'

[tool.pytest.ini_options]
minversion = "6.0"
addopts = "--cov=src"
testpaths = [
    "tests",
]

[tool.isort]
profile = "black"
multi_line_output = 3

setup.cfg
@@ -0,0 +1,21 @@

[flake8]
max-line-length = 100
exclude = .git,__pycache__,build,dist
ignore = E203,W503

[mypy]
python_version = 3.8
warn_return_any = True
warn_unused_configs = True
disallow_untyped_defs = True

[coverage:run]
source = src
omit = tests/*

[coverage:report]
exclude_lines =
    pragma: no cover
    def __repr__
    raise NotImplementedError
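
Combined with the flake8-docstrings hook configured earlier, these settings amount to requiring typed, documented code at a 100-character line limit. A minimal hypothetical module (not part of the commit) that passes flake8, mypy with disallow_untyped_defs, and black at this line length:

# Hypothetical module, included only to illustrate what the lint and type-check settings demand.
"""Utility helpers used to illustrate the lint and type-checking configuration."""

from typing import List


def mean_length(texts: List[str]) -> float:
    """Return the average character length of the given texts."""
    if not texts:
        raise ValueError("texts must not be empty")
    return sum(len(text) for text in texts) / len(texts)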

setup.py
@@ -2,16 +2,28 @@
 
 setup(
     name="synsearch",
-    version="0.1",
+    version="0.1.0",
     packages=find_packages(),
     install_requires=[
-        'numpy',
-        'torch',
-        'transformers',
-        'scikit-learn',
-        'hdbscan',
-        'pyyaml',
-        'nltk',
-        'rouge_score'
-    ]
-)
+        "numpy>=1.19.0",
+        "torch>=2.0.0",
+        "transformers>=4.15.0",
+        "scikit-learn>=0.24.0",
+        "sentence-transformers>=2.2.0",
+        "hdbscan>=0.8.29",
+        "plotly>=5.3.0",
+        "streamlit>=1.2.0",
+        "pytest>=6.0.0"
+    ],
+    python_requires=">=3.8",
+    author="Your Name",
+    author_email="[email protected]",
+    description="Dynamic Summarization and Adaptive Clustering Framework",
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+)

tests/test_clustering.py
@@ -0,0 +1,97 @@
import pytest
from src.clustering.dynamic_cluster_manager import DynamicClusterManager
import numpy as np
from src.clustering.dynamic_clusterer import DynamicClusterer
from src.clustering.streaming_manager import StreamingClusterManager


@pytest.fixture
def sample_embeddings():
    """Generate sample embeddings with clear cluster structure."""
    np.random.seed(42)
    n_samples = 100
    n_features = 768

    # Create 3 distinct clusters
    cluster1 = np.random.normal(0, 0.1, (n_samples // 3, n_features))
    cluster2 = np.random.normal(3, 0.1, (n_samples // 3, n_features))
    cluster3 = np.random.normal(-3, 0.1, (n_samples // 3, n_features))

    return np.vstack([cluster1, cluster2, cluster3])


@pytest.fixture
def config():
    """Sample configuration for clustering."""
    return {
        'min_cluster_size': 5,
        'min_samples': 3,
        'similarity_threshold': 0.8
    }


def test_adaptive_clustering():
    cluster_manager = DynamicClusterManager({'min_cluster_size': 5})
    embeddings = np.random.rand(100, 768)
    labels = cluster_manager.fit_predict(embeddings)

    assert isinstance(labels, np.ndarray)
    assert len(labels) == len(embeddings)
    assert len(np.unique(labels)) > 1


def test_online_clustering():
    cluster_manager = DynamicClusterManager({'online_mode': True})
    initial_data = np.random.rand(50, 768)
    new_data = np.random.rand(10, 768)

    # Initial clustering
    initial_labels = cluster_manager.fit_predict(initial_data)
    # Update with new data
    updated_labels = cluster_manager.update(new_data)

    assert len(updated_labels) == len(new_data)


def test_dynamic_clustering(sample_embeddings, config):
    """Test dynamic clustering algorithm selection."""
    clusterer = DynamicClusterer(config)
    labels, metrics = clusterer.fit_predict(sample_embeddings)

    assert isinstance(labels, np.ndarray)
    assert len(labels) == len(sample_embeddings)
    assert isinstance(metrics, dict)
    assert metrics['silhouette_score'] > 0.5


def test_streaming_clustering(sample_embeddings, config):
    """Test streaming cluster updates."""
    manager = StreamingClusterManager(config)

    # Split embeddings into batches, including the final partial batch so that
    # every embedding is processed and the total_docs assertion below can hold.
    batch_size = 20

    all_results = []
    for start_idx in range(0, len(sample_embeddings), batch_size):
        end_idx = min(start_idx + batch_size, len(sample_embeddings))
        batch_embeddings = sample_embeddings[start_idx:end_idx]
        metadata = [{'id': j} for j in range(start_idx, end_idx)]

        results = manager.update(batch_embeddings, metadata)
        all_results.append(results)

    final_clusters = manager.get_clusters()

    assert isinstance(final_clusters, dict)
    assert len(final_clusters) > 0
    assert all(isinstance(cluster, list) for cluster in final_clusters.values())

    # Verify cluster statistics
    stats = all_results[-1]['stats']
    assert stats['num_clusters'] > 0
    assert stats['total_docs'] == len(sample_embeddings)
    assert stats['avg_cluster_size'] > 0


def test_empty_clustering(config):
    """Test handling of empty input."""
    clusterer = DynamicClusterer(config)
    empty_embeddings = np.array([])

    with pytest.raises(ValueError):
        clusterer.fit_predict(empty_embeddings)
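
The src.clustering modules imported by these tests are not part of this diff, so their behaviour can only be inferred from the assertions. As a hedged sketch of the constructor and fit_predict contract that test_adaptive_clustering assumes, a minimal manager could wrap hdbscan (a declared dependency in setup.py); the real DynamicClusterManager may differ substantially.

# Hedged sketch of the interface the tests assume; not the project's actual implementation.
from typing import Any, Dict, Optional

import hdbscan
import numpy as np


class DynamicClusterManagerSketch:
    """Minimal HDBSCAN-backed stand-in for the fit_predict interface used in the tests."""

    def __init__(self, config: Dict[str, Any]) -> None:
        self.min_cluster_size: int = config.get("min_cluster_size", 5)
        self.min_samples: Optional[int] = config.get("min_samples")

    def fit_predict(self, embeddings: np.ndarray) -> np.ndarray:
        """Cluster the embeddings and return one integer label per row (-1 marks noise)."""
        if embeddings.size == 0:
            raise ValueError("embeddings must not be empty")
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=self.min_cluster_size,
            min_samples=self.min_samples,
        )
        return clusterer.fit_predict(embeddings)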