From e5e7796b82518de6d965d4311f3b0a7bf8ad2545 Mon Sep 17 00:00:00 2001 From: 0xMimir Date: Tue, 28 Nov 2023 15:28:57 +0100 Subject: [PATCH] Added cron to scrape github topic repositories --- .../jobs/github_topics_scraper/contract.rs | 34 ++++ .../src/jobs/github_topics_scraper/domain.rs | 160 ++++++++++++++++++ .../infrastructure/mod.rs | 5 + .../infrastructure/repository.rs | 68 ++++++++ .../infrastructure/service.rs | 48 ++++++ .../infrastructure/sql/unscraped_projects.sql | 5 + .../sql/unscraped_repositories.sql | 6 + .../infrastructure/sql/update_statement.sql | 11 ++ .../api/src/jobs/github_topics_scraper/mod.rs | 34 ++++ backend/api/src/jobs/mod.rs | 2 + backend/libs/sdks/src/github/contract.rs | 5 + backend/libs/sdks/src/github/domain.rs | 5 + 12 files changed, 383 insertions(+) create mode 100644 backend/api/src/jobs/github_topics_scraper/contract.rs create mode 100644 backend/api/src/jobs/github_topics_scraper/domain.rs create mode 100644 backend/api/src/jobs/github_topics_scraper/infrastructure/mod.rs create mode 100644 backend/api/src/jobs/github_topics_scraper/infrastructure/repository.rs create mode 100644 backend/api/src/jobs/github_topics_scraper/infrastructure/service.rs create mode 100644 backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_projects.sql create mode 100644 backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_repositories.sql create mode 100644 backend/api/src/jobs/github_topics_scraper/infrastructure/sql/update_statement.sql create mode 100644 backend/api/src/jobs/github_topics_scraper/mod.rs diff --git a/backend/api/src/jobs/github_topics_scraper/contract.rs b/backend/api/src/jobs/github_topics_scraper/contract.rs new file mode 100644 index 0000000..a197f0d --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/contract.rs @@ -0,0 +1,34 @@ +use error::Result; +use sea_orm::prelude::Uuid; +use store::{github_projects, github_repositories}; + +#[async_trait] +pub trait DbServiceContract { + /// + /// Insert new project + /// + async fn insert_project(&self, project: github_projects::ActiveModel) -> Result<()>; + + /// + /// Insert new repository + /// + async fn insert_repository(&self, repository: github_repositories::ActiveModel) -> Result<()>; + + /// + /// Run update status of github topic repositories + /// + async fn update_projects(&self) -> Result<()>; +} + +#[async_trait] +pub trait DbRepositoryContract { + /// + /// Get all projects from github topic repositories that are not scraped + /// + async fn get_unscraped_projects(&self) -> Result>; + + /// + /// Get unscraped github topic repositories, return (Project Name, Repository Name, Project ID) + /// + async fn get_unscraped_repositories(&self) -> Result>; +} diff --git a/backend/api/src/jobs/github_topics_scraper/domain.rs b/backend/api/src/jobs/github_topics_scraper/domain.rs new file mode 100644 index 0000000..ff3797b --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/domain.rs @@ -0,0 +1,160 @@ +use super::contract::{DbRepositoryContract, DbServiceContract}; +use cronus::{Job, Schedule}; +use error::Result; +use sdks::github::{ + data::{GithubRepository, ProfileInfo}, + GithubContract, +}; +use sea_orm::{prelude::Uuid, ActiveValue::NotSet, Set}; +use store::{github_projects, github_repositories}; + +pub struct GithubTopicsScraper< + Repository: DbRepositoryContract, + Service: DbServiceContract, + Github: GithubContract, +> { + repository: Repository, + service: Service, + github: Github, +} + +impl< + Repository: DbRepositoryContract + Send + Sync + 'static, + Service: DbServiceContract + Send + Sync + 'static, + Github: GithubContract + Send + Sync + 'static, + > GithubTopicsScraper +{ + /// + /// Creates GithubTopicsScraper + /// + pub fn new(repository: Repository, service: Service, github: Github) -> Self { + Self { + repository, + service, + github, + } + } + + /// + /// Cron job that runs once a week + /// + async fn cron_job(&self) -> Result<()> { + self.handle_projects().await?; + self.handle_repositories().await?; + self.service.update_projects().await?; + Ok(()) + } + + /// + /// Download unscraped repositories + /// + async fn handle_repositories(&self) -> Result<()> { + let repositories = self.repository.get_unscraped_repositories().await?; + + for (project_name, repository_name, project_id) in repositories { + if let Err(error) = self + .handle_repository(project_name, repository_name, project_id) + .await + { + error!("{}", error); + } + } + + Ok(()) + } + + /// + /// Download repository and store it in db + /// + async fn handle_repository( + &self, + project_name: String, + repository_name: String, + project_id: Uuid, + ) -> Result<()> { + let GithubRepository { + name, + language, + stargazers_count, + forks_count, + created_at, + updated_at, + archived, + fork, + } = self + .github + .get_repository(&project_name, &repository_name) + .await?; + + let repository = github_repositories::ActiveModel { + id: NotSet, + project: Set(project_id), + repository_name: Set(name), + language: Set(language), + stargazers_count: Set(stargazers_count), + forks_count: Set(forks_count), + created_at: Set(created_at), + updated_at: Set(updated_at), + archived: Set(archived), + fork: Set(fork), + }; + + self.service.insert_repository(repository).await?; + Ok(()) + } + + /// + /// Download unscraped projects + /// + async fn handle_projects(&self) -> Result<()> { + let projects = self.repository.get_unscraped_projects().await?; + for project in projects { + if let Err(error) = self.handle_project(project).await { + error!("{}", error); + } + } + + Ok(()) + } + + /// + /// Download project and store it in db + /// + async fn handle_project(&self, project: String) -> Result<()> { + let ProfileInfo { + profile_type, + followers, + site, + } = self.github.get_profile(&project).await?; + + let project = github_projects::ActiveModel { + id: NotSet, + name: Set(project), + profile_type: Set(Some(profile_type.to_string())), + url: Set(match site.is_empty() { + true => None, + false => Some(site), + }), + followers: Set(followers), + }; + + self.service.insert_project(project).await + } +} + +#[async_trait] +impl Job for GithubTopicsScraper +where + Repository: DbRepositoryContract + Send + Sync + 'static, + Service: DbServiceContract + Send + Sync + 'static, + Github: GithubContract + Send + Sync + 'static, +{ + fn schedule(&self) -> Schedule { + "0 0 0 * * Mon".parse().expect("Invalid schedule") + } + async fn job(&self) { + if let Err(error) = self.cron_job().await { + error!("{}", error); + } + } +} diff --git a/backend/api/src/jobs/github_topics_scraper/infrastructure/mod.rs b/backend/api/src/jobs/github_topics_scraper/infrastructure/mod.rs new file mode 100644 index 0000000..d2abf7b --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/infrastructure/mod.rs @@ -0,0 +1,5 @@ +mod service; +mod repository; + +pub use service::PgService; +pub use repository::PgRepository; \ No newline at end of file diff --git a/backend/api/src/jobs/github_topics_scraper/infrastructure/repository.rs b/backend/api/src/jobs/github_topics_scraper/infrastructure/repository.rs new file mode 100644 index 0000000..0854df7 --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/infrastructure/repository.rs @@ -0,0 +1,68 @@ +use error::Result; +use sea_orm::{ + prelude::Uuid, DatabaseBackend, DatabaseConnection, EntityTrait, FromQueryResult, Statement, +}; +use std::sync::Arc; + +use super::super::contract::DbRepositoryContract; +use store::topics_repositories; + +const UNSCRAPED_PROJECTS_QUERY: &str = include_str!("sql/unscraped_projects.sql"); +const UNSCRAPED_REPOSITORIES_QUERY: &str = include_str!("sql/unscraped_repositories.sql"); + +pub struct PgRepository { + conn: Arc, +} + +impl PgRepository { + pub fn new(conn: Arc) -> Self { + Self { conn } + } +} + +#[async_trait] +impl DbRepositoryContract for PgRepository { + async fn get_unscraped_projects(&self) -> Result> { + let projects = topics_repositories::Entity::find() + .from_raw_sql(Statement::from_string( + DatabaseBackend::Postgres, + UNSCRAPED_PROJECTS_QUERY, + )) + .into_model::() + .all(self.conn.as_ref()) + .await? + .into_iter() + .map(|repo| repo.project_name) + .collect(); + + Ok(projects) + } + + async fn get_unscraped_repositories(&self) -> Result> { + let repositories = topics_repositories::Entity::find() + .from_raw_sql(Statement::from_string( + DatabaseBackend::Postgres, + UNSCRAPED_REPOSITORIES_QUERY, + )) + .into_model::() + .all(self.conn.as_ref()) + .await? + .into_iter() + .map(|repo| (repo.project_name, repo.repository_name, repo.repository_owner)) + .collect(); + + Ok(repositories) + } +} + +#[derive(FromQueryResult)] +pub struct RepositoryOwner { + project_name: String, +} + +#[derive(FromQueryResult)] +pub struct Repository { + project_name: String, + repository_name: String, + repository_owner: Uuid, +} diff --git a/backend/api/src/jobs/github_topics_scraper/infrastructure/service.rs b/backend/api/src/jobs/github_topics_scraper/infrastructure/service.rs new file mode 100644 index 0000000..d9f9978 --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/infrastructure/service.rs @@ -0,0 +1,48 @@ +use error::Result; +use sea_orm::{DatabaseConnection, EntityTrait, Statement, ConnectionTrait}; +use std::sync::Arc; + +use super::super::contract::DbServiceContract; +use store::{github_projects, github_repositories}; + +const UPDATE_STATEMENT: &str = include_str!("sql/update_statement.sql"); + +pub struct PgService { + conn: Arc, +} + +impl PgService { + pub fn new(conn: Arc) -> Self { + Self { conn } + } +} + +#[async_trait] +impl DbServiceContract for PgService { + async fn insert_project(&self, project: github_projects::ActiveModel) -> Result<()> { + github_projects::Entity::insert(project) + .exec(self.conn.as_ref()) + .await?; + + Ok(()) + } + + async fn insert_repository(&self, repository: github_repositories::ActiveModel) -> Result<()> { + github_repositories::Entity::insert(repository) + .exec(self.conn.as_ref()) + .await?; + + Ok(()) + } + + async fn update_projects(&self) -> Result<()> { + self.conn + .as_ref() + .execute(Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + UPDATE_STATEMENT, + )) + .await?; + Ok(()) + } +} diff --git a/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_projects.sql b/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_projects.sql new file mode 100644 index 0000000..e466c91 --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_projects.sql @@ -0,0 +1,5 @@ +select distinct tr.repository_owner as project_name +from topics_repositories tr + full outer join github_projects gp on gp."name" = tr.repository_owner +where tr.scraped = false + and gp.id isnull \ No newline at end of file diff --git a/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_repositories.sql b/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_repositories.sql new file mode 100644 index 0000000..f8dc1da --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_repositories.sql @@ -0,0 +1,6 @@ +select tr.repository_owner as project_name, + tr.repository_name as repository_name, + gp.id as repository_owner +from topics_repositories tr + inner join github_projects gp on gp."name" = tr.repository_owner +where tr.scraped = false \ No newline at end of file diff --git a/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/update_statement.sql b/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/update_statement.sql new file mode 100644 index 0000000..e261c94 --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/infrastructure/sql/update_statement.sql @@ -0,0 +1,11 @@ +update topics_repositories +set scraped = true +from ( + select tr.id + from topics_repositories tr + inner join github_projects gp on gp."name" = tr.repository_owner + inner join github_repositories gr on gr.repository_name = tr.repository_name + and gr.project = gp.id + where not tr.scraped + ) as subquery +where topics_repositories.id = subquery.id \ No newline at end of file diff --git a/backend/api/src/jobs/github_topics_scraper/mod.rs b/backend/api/src/jobs/github_topics_scraper/mod.rs new file mode 100644 index 0000000..11332f7 --- /dev/null +++ b/backend/api/src/jobs/github_topics_scraper/mod.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use cronus::Cronus; +use sdks::github::Github; +use sea_orm::DatabaseConnection; + +use self::{ + domain::GithubTopicsScraper, + infrastructure::{PgRepository, PgService}, +}; + +mod contract; +mod domain; +mod infrastructure; + +pub fn setup(cron: &Cronus, sea_pool: Arc) { + let job = create_gr(sea_pool); + cron.add(job).expect("Error adding job"); +} + +fn create_gr( + sea_pool: Arc, +) -> GithubTopicsScraper { + let repository = PgRepository::new(sea_pool.clone()); + let service = PgService::new(sea_pool); + let github_api_key = config::get("GITHUB_KEY").ok(); + + let github = match github_api_key { + Some(api_key) => Github::new_with_auth(api_key), + None => Github::default(), + }; + + GithubTopicsScraper::new(repository, service, github) +} diff --git a/backend/api/src/jobs/mod.rs b/backend/api/src/jobs/mod.rs index 2d64d69..52fad33 100644 --- a/backend/api/src/jobs/mod.rs +++ b/backend/api/src/jobs/mod.rs @@ -8,6 +8,7 @@ pub mod info; mod github_issues; mod github_project_info; mod github_topics; +mod github_topics_scraper; pub fn setup(sea_pool: Arc) -> Cronus { let cron = Cronus::new(); @@ -16,6 +17,7 @@ pub fn setup(sea_pool: Arc) -> Cronus { github_repositories::setup(&cron, sea_pool.clone()); github_project_info::setup(&cron, sea_pool.clone()); github_topics::setup(&cron, sea_pool.clone()); + github_topics_scraper::setup(&cron, sea_pool.clone()); github_issues::setup(&cron, sea_pool); cron diff --git a/backend/libs/sdks/src/github/contract.rs b/backend/libs/sdks/src/github/contract.rs index af67ca5..bd0d148 100644 --- a/backend/libs/sdks/src/github/contract.rs +++ b/backend/libs/sdks/src/github/contract.rs @@ -33,4 +33,9 @@ pub trait GithubContract { /// Get repositories for github topic, max page is 50 /// async fn get_topic_repositories(&self, topic: &str, page: u8) -> Result>; + + /// + /// Get repository info + /// + async fn get_repository(&self, project: &str, repository: &str) -> Result; } diff --git a/backend/libs/sdks/src/github/domain.rs b/backend/libs/sdks/src/github/domain.rs index de3705d..58aa75a 100644 --- a/backend/libs/sdks/src/github/domain.rs +++ b/backend/libs/sdks/src/github/domain.rs @@ -125,6 +125,11 @@ impl GithubContract for Github { .map(|topic| topic.ok_or(Error::InternalServer("Error parsing repository".to_owned()))) .try_collect() } + + async fn get_repository(&self, project: &str, repository: &str) -> Result { + let url = format!("https://api.github.com/repos/{}/{}", project, repository); + self.get(url).await + } } impl Github {