- Threads
- Cache
- Robots.txt
- Update site info after a set time
- Proxy
- Queue (BFS; see the crawl-loop sketch after this list)
- Detect trackers
- HTTP to HTTPS
- Normalize URL
- Encryption (RSA)
- API
- Proxy
- Nodes
- Rating
- Encryption (asymmetric)
- Multithreaded crawler
- Robots rules (from headers & HTML) & crawl-delay
- Responsive web design
- Own FTS (...)
- Images crawler
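The queue (BFS), URL normalization, HTTP to HTTPS, and cache items above fit together in a single crawl loop. The sketch below is illustrative only, not librengine's code: fetching and link extraction are stubbed out, `normalize` and `fetch_links` are hypothetical names, and the real crawler adds threads, robots.txt checks, crawl delays, and an LRU cache on top.

```cpp
// Illustrative BFS crawl loop with URL normalization and a visited set.
// NOT librengine's implementation; fetching/parsing are stubbed out.
#include <cctype>
#include <iostream>
#include <queue>
#include <string>
#include <unordered_set>
#include <vector>

// Normalize a URL enough for deduplication: upgrade http:// to https://,
// lowercase the scheme/host part, and drop a trailing slash.
std::string normalize(std::string url) {
    if (url.rfind("http://", 0) == 0) url = "https://" + url.substr(7);
    const size_t scheme_len = std::string("https://").size();
    size_t path_start = url.find('/', scheme_len);
    size_t host_end = (path_start == std::string::npos) ? url.size() : path_start;
    for (size_t i = 0; i < host_end; ++i)
        url[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(url[i])));
    if (!url.empty() && url.back() == '/') url.pop_back();
    return url;
}

// Stub: a real crawler would download the page and extract its links here.
std::vector<std::string> fetch_links(const std::string &url) {
    std::cout << "crawl: " << url << "\n";
    return {};
}

int main() {
    std::queue<std::string> frontier;         // BFS queue of pending URLs
    std::unordered_set<std::string> visited;  // dedup cache (LRU in a real crawler)
    const size_t max_pages = 5;               // cf. max_pages_site in config.json

    frontier.push(normalize("http://example.org/"));

    size_t crawled = 0;
    while (!frontier.empty() && crawled < max_pages) {
        std::string url = frontier.front();
        frontier.pop();
        if (!visited.insert(url).second) continue;  // already seen
        ++crawled;
        for (const std::string &link : fetch_links(url)) {
            std::string n = normalize(link);
            if (!visited.count(n)) frontier.push(n);
        }
    }
}
```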
#install dependencies (pick the script for your distribution)
sh scripts/install_dependencies_arch.sh
sh scripts/install_dependencies_debian.sh
#build all components
sh scripts/build_all.sh
#start Typesense (keep it running, e.g. in a separate terminal), then initialize the database
mkdir -p /tmp/typesense-data
./typesense-server --data-dir=/tmp/typesense-data --api-key=xyz --enable-cors
sh scripts/init_db.sh
./crawler ../../sites.txt 5 ../../config.json
#[sites_path] [threads_count] [config_path]
./website ../../config.json
#[config_path]
./cli gnu 1 ../../config.json
#[query] [page] [config_path]
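`[sites_path]` is the seed list the crawler starts from. Assuming it is a plain newline-separated list of start URLs (an assumption; check the crawler source if your copy differs), a minimal sites.txt could look like this:

```text
https://www.gnu.org/
https://example.org/
```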
//proxy: type://ip:port, or an empty string ("") to disable
//example: socks5://127.0.0.1:9050
//keys with the _s suffix are values in seconds
{
"global": {
//if changed, also edit website/frontend/js/search_encrypt.js
"rsa_key_length": 1024, //1024|2048|4096
"max_title_show_size": 55,
"max_desc_show_size": 350,
"nodes": [
{
"name": "This",
"url": "http://127.0.0.1:8080"
}
]
},
"crawler": {
"user_agent": "librengine",
"proxy": "socks5://127.0.0.1:9050",
"load_page_timeout_s": 10,
"update_time_site_info_s_after": 864000, //10 days
"delay_time_s": 3,
"max_pages_site": 5,
"max_page_symbols": 50000000, //50mb
"max_robots_txt_symbols": 3000,
"max_lru_cache_size_host": 512,
"max_lru_cache_size_url": 512,
"is_http_to_https": true,
"is_check_robots_txt": true
},
"cli": {
"proxy": "socks5://127.0.0.1:9050"
},
"website": {
"port": 8080,
"proxy": "socks5://127.0.0.1:9050"
},
//if changed, also edit init_db.sh
"db": {
"url": "http://localhost:8108",
"api_key": "xyz"
}
}
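Because the example above uses //-style comments, a strict JSON parser will reject it unless comments are stripped or ignored. The sketch below is illustrative only and assumes the nlohmann/json library (not necessarily what librengine uses); its `ignore_comments` option accepts the format as shown:

```cpp
// Illustrative config loader, assuming nlohmann/json; not librengine's code.
#include <fstream>
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

int main(int argc, char **argv) {
    std::ifstream file(argc > 1 ? argv[1] : "config.json");

    // ignore_comments=true makes the parser skip the //-style comments above.
    nlohmann::json config = nlohmann::json::parse(
        file, /*cb=*/nullptr, /*allow_exceptions=*/true, /*ignore_comments=*/true);

    // Read a few of the fields shown in the example.
    auto user_agent = config["crawler"]["user_agent"].get<std::string>();
    auto port       = config["website"]["port"].get<int>();
    auto db_url     = config["db"]["url"].get<std::string>();

    std::cout << user_agent << " " << port << " " << db_url << "\n";
}
```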
GNU Affero General Public License v3.0