Skip to content

Commit

Permalink
Added cron to scrape github topic repositories
Browse files Browse the repository at this point in the history
  • Loading branch information
0xMimir committed Nov 28, 2023
1 parent 1061ad9 commit e5e7796
Show file tree
Hide file tree
Showing 12 changed files with 383 additions and 0 deletions.
34 changes: 34 additions & 0 deletions backend/api/src/jobs/github_topics_scraper/contract.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use error::Result;
use sea_orm::prelude::Uuid;
use store::{github_projects, github_repositories};

#[async_trait]
pub trait DbServiceContract {
///
/// Insert new project
///
async fn insert_project(&self, project: github_projects::ActiveModel) -> Result<()>;

///
/// Insert new repository
///
async fn insert_repository(&self, repository: github_repositories::ActiveModel) -> Result<()>;

///
/// Run update status of github topic repositories
///
async fn update_projects(&self) -> Result<()>;
}

#[async_trait]
pub trait DbRepositoryContract {
///
/// Get all projects from github topic repositories that are not scraped
///
async fn get_unscraped_projects(&self) -> Result<Vec<String>>;

///
/// Get unscraped github topic repositories, return (Project Name, Repository Name, Project ID)
///
async fn get_unscraped_repositories(&self) -> Result<Vec<(String, String, Uuid)>>;
}
160 changes: 160 additions & 0 deletions backend/api/src/jobs/github_topics_scraper/domain.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
use super::contract::{DbRepositoryContract, DbServiceContract};
use cronus::{Job, Schedule};
use error::Result;
use sdks::github::{
data::{GithubRepository, ProfileInfo},
GithubContract,
};
use sea_orm::{prelude::Uuid, ActiveValue::NotSet, Set};
use store::{github_projects, github_repositories};

pub struct GithubTopicsScraper<
Repository: DbRepositoryContract,
Service: DbServiceContract,
Github: GithubContract,
> {
repository: Repository,
service: Service,
github: Github,
}

impl<
Repository: DbRepositoryContract + Send + Sync + 'static,
Service: DbServiceContract + Send + Sync + 'static,
Github: GithubContract + Send + Sync + 'static,
> GithubTopicsScraper<Repository, Service, Github>
{
///
/// Creates GithubTopicsScraper
///
pub fn new(repository: Repository, service: Service, github: Github) -> Self {
Self {
repository,
service,
github,
}
}

///
/// Cron job that runs once a week
///
async fn cron_job(&self) -> Result<()> {
self.handle_projects().await?;
self.handle_repositories().await?;
self.service.update_projects().await?;
Ok(())
}

///
/// Download unscraped repositories
///
async fn handle_repositories(&self) -> Result<()> {
let repositories = self.repository.get_unscraped_repositories().await?;

for (project_name, repository_name, project_id) in repositories {
if let Err(error) = self
.handle_repository(project_name, repository_name, project_id)
.await
{
error!("{}", error);
}
}

Ok(())
}

///
/// Download repository and store it in db
///
async fn handle_repository(
&self,
project_name: String,
repository_name: String,
project_id: Uuid,
) -> Result<()> {
let GithubRepository {
name,
language,
stargazers_count,
forks_count,
created_at,
updated_at,
archived,
fork,
} = self
.github
.get_repository(&project_name, &repository_name)
.await?;

let repository = github_repositories::ActiveModel {
id: NotSet,
project: Set(project_id),
repository_name: Set(name),
language: Set(language),
stargazers_count: Set(stargazers_count),
forks_count: Set(forks_count),
created_at: Set(created_at),
updated_at: Set(updated_at),
archived: Set(archived),
fork: Set(fork),
};

self.service.insert_repository(repository).await?;
Ok(())
}

///
/// Download unscraped projects
///
async fn handle_projects(&self) -> Result<()> {
let projects = self.repository.get_unscraped_projects().await?;
for project in projects {
if let Err(error) = self.handle_project(project).await {
error!("{}", error);
}
}

Ok(())
}

///
/// Download project and store it in db
///
async fn handle_project(&self, project: String) -> Result<()> {
let ProfileInfo {
profile_type,
followers,
site,
} = self.github.get_profile(&project).await?;

let project = github_projects::ActiveModel {
id: NotSet,
name: Set(project),
profile_type: Set(Some(profile_type.to_string())),
url: Set(match site.is_empty() {
true => None,
false => Some(site),
}),
followers: Set(followers),
};

self.service.insert_project(project).await
}
}

#[async_trait]
impl<Repository, Service, Github> Job for GithubTopicsScraper<Repository, Service, Github>
where
Repository: DbRepositoryContract + Send + Sync + 'static,
Service: DbServiceContract + Send + Sync + 'static,
Github: GithubContract + Send + Sync + 'static,
{
fn schedule(&self) -> Schedule {
"0 0 0 * * Mon".parse().expect("Invalid schedule")
}
async fn job(&self) {
if let Err(error) = self.cron_job().await {
error!("{}", error);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mod service;
mod repository;

pub use service::PgService;
pub use repository::PgRepository;
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
use error::Result;
use sea_orm::{
prelude::Uuid, DatabaseBackend, DatabaseConnection, EntityTrait, FromQueryResult, Statement,
};
use std::sync::Arc;

use super::super::contract::DbRepositoryContract;
use store::topics_repositories;

const UNSCRAPED_PROJECTS_QUERY: &str = include_str!("sql/unscraped_projects.sql");
const UNSCRAPED_REPOSITORIES_QUERY: &str = include_str!("sql/unscraped_repositories.sql");

pub struct PgRepository {
conn: Arc<DatabaseConnection>,
}

impl PgRepository {
pub fn new(conn: Arc<DatabaseConnection>) -> Self {
Self { conn }
}
}

#[async_trait]
impl DbRepositoryContract for PgRepository {
async fn get_unscraped_projects(&self) -> Result<Vec<String>> {
let projects = topics_repositories::Entity::find()
.from_raw_sql(Statement::from_string(
DatabaseBackend::Postgres,
UNSCRAPED_PROJECTS_QUERY,
))
.into_model::<RepositoryOwner>()
.all(self.conn.as_ref())
.await?
.into_iter()
.map(|repo| repo.project_name)
.collect();

Ok(projects)
}

async fn get_unscraped_repositories(&self) -> Result<Vec<(String, String, Uuid)>> {
let repositories = topics_repositories::Entity::find()
.from_raw_sql(Statement::from_string(
DatabaseBackend::Postgres,
UNSCRAPED_REPOSITORIES_QUERY,
))
.into_model::<Repository>()
.all(self.conn.as_ref())
.await?
.into_iter()
.map(|repo| (repo.project_name, repo.repository_name, repo.repository_owner))
.collect();

Ok(repositories)
}
}

#[derive(FromQueryResult)]
pub struct RepositoryOwner {
project_name: String,
}

#[derive(FromQueryResult)]
pub struct Repository {
project_name: String,
repository_name: String,
repository_owner: Uuid,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use error::Result;
use sea_orm::{DatabaseConnection, EntityTrait, Statement, ConnectionTrait};
use std::sync::Arc;

use super::super::contract::DbServiceContract;
use store::{github_projects, github_repositories};

const UPDATE_STATEMENT: &str = include_str!("sql/update_statement.sql");

pub struct PgService {
conn: Arc<DatabaseConnection>,
}

impl PgService {
pub fn new(conn: Arc<DatabaseConnection>) -> Self {
Self { conn }
}
}

#[async_trait]
impl DbServiceContract for PgService {
async fn insert_project(&self, project: github_projects::ActiveModel) -> Result<()> {
github_projects::Entity::insert(project)
.exec(self.conn.as_ref())
.await?;

Ok(())
}

async fn insert_repository(&self, repository: github_repositories::ActiveModel) -> Result<()> {
github_repositories::Entity::insert(repository)
.exec(self.conn.as_ref())
.await?;

Ok(())
}

async fn update_projects(&self) -> Result<()> {
self.conn
.as_ref()
.execute(Statement::from_string(
sea_orm::DatabaseBackend::Postgres,
UPDATE_STATEMENT,
))
.await?;
Ok(())
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
select distinct tr.repository_owner as project_name
from topics_repositories tr
full outer join github_projects gp on gp."name" = tr.repository_owner
where tr.scraped = false
and gp.id isnull
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
select tr.repository_owner as project_name,
tr.repository_name as repository_name,
gp.id as repository_owner
from topics_repositories tr
inner join github_projects gp on gp."name" = tr.repository_owner
where tr.scraped = false
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
update topics_repositories
set scraped = true
from (
select tr.id
from topics_repositories tr
inner join github_projects gp on gp."name" = tr.repository_owner
inner join github_repositories gr on gr.repository_name = tr.repository_name
and gr.project = gp.id
where not tr.scraped
) as subquery
where topics_repositories.id = subquery.id
34 changes: 34 additions & 0 deletions backend/api/src/jobs/github_topics_scraper/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use std::sync::Arc;

use cronus::Cronus;
use sdks::github::Github;
use sea_orm::DatabaseConnection;

use self::{
domain::GithubTopicsScraper,
infrastructure::{PgRepository, PgService},
};

mod contract;
mod domain;
mod infrastructure;

pub fn setup(cron: &Cronus, sea_pool: Arc<DatabaseConnection>) {
let job = create_gr(sea_pool);
cron.add(job).expect("Error adding job");
}

fn create_gr(
sea_pool: Arc<DatabaseConnection>,
) -> GithubTopicsScraper<PgRepository, PgService, Github> {
let repository = PgRepository::new(sea_pool.clone());
let service = PgService::new(sea_pool);
let github_api_key = config::get("GITHUB_KEY").ok();

let github = match github_api_key {
Some(api_key) => Github::new_with_auth(api_key),
None => Github::default(),
};

GithubTopicsScraper::new(repository, service, github)
}
2 changes: 2 additions & 0 deletions backend/api/src/jobs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pub mod info;
mod github_issues;
mod github_project_info;
mod github_topics;
mod github_topics_scraper;

pub fn setup(sea_pool: Arc<DatabaseConnection>) -> Cronus {
let cron = Cronus::new();
Expand All @@ -16,6 +17,7 @@ pub fn setup(sea_pool: Arc<DatabaseConnection>) -> Cronus {
github_repositories::setup(&cron, sea_pool.clone());
github_project_info::setup(&cron, sea_pool.clone());
github_topics::setup(&cron, sea_pool.clone());
github_topics_scraper::setup(&cron, sea_pool.clone());
github_issues::setup(&cron, sea_pool);

cron
Expand Down
Loading

0 comments on commit e5e7796

Please sign in to comment.