Skip to content

Commit

Permalink
Merge pull request #1 from superco01/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
superco01 authored Apr 11, 2022
2 parents af47b9c + 7b07a61 commit 8c5d83c
Show file tree
Hide file tree
Showing 10 changed files with 1,673 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.idea
.target
/target
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

# Web Scraping

A programming utility that scrapes data from several websites to support faster data collection.
## Features
v1.0
- Collect data from Tokopedia (e-commerce)
- Configurable number of items to collect (from 1 to 100 in the current version)
- Export collected data to csv file format
## Prerequisites
Programming Language: Java 8 or higher (https://www.oracle.com/java/technologies/downloads/)

Dependencies:
- Jsoup latest version (https://jsoup.org/)
- Opencsv latest version (http://opencsv.sourceforge.net/)
29 changes: 29 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.brick</groupId>
<artifactId>Brick</artifactId>
<version>1.0-SNAPSHOT</version>

<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
</properties>

<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>5.6</version>
</dependency>
</dependencies>

</project>
14 changes: 14 additions & 0 deletions src/main/java/Application.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import model.Product;
import utils.TokopediaScraperUtil;

import java.io.IOException;
import java.util.List;

/**
 * Command-line entry point: scrapes Tokopedia product listings and writes
 * them to a CSV file.
 */
public class Application {

    /**
     * Extracts products via {@link TokopediaScraperUtil#extract(int)} and hands
     * the result to {@link TokopediaScraperUtil#export(List, String)}.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if scraping or writing the CSV file fails
     */
    public static void main(String[] args) throws IOException {
        final int requestedCount = 100;
        final String outputPath = "storage/Tokopedia_Phone.csv";

        List<Product> scrapedProducts = TokopediaScraperUtil.extract(requestedCount);
        TokopediaScraperUtil.export(scrapedProducts, outputPath);
    }
}
11 changes: 11 additions & 0 deletions src/main/java/constants/ProductConstant.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package constants;

/**
 * Column header labels for the exported product CSV.
 *
 * <p>The fields are {@code final} so that no code can reassign a header at
 * runtime; the class only holds constants and is not meant to be instantiated.
 */
public class ProductConstant {

    public static final String NAME = "Name";
    public static final String DESCRIPTION = "Description";
    public static final String IMAGE_LINK = "Image Link";
    public static final String PRICE = "Price";
    public static final String RATING = "Rating";
    public static final String MERCHANT = "Merchant";

    /** Constant holder; no instances. */
    private ProductConstant() {
    }
}
24 changes: 24 additions & 0 deletions src/main/java/constants/TokopediaConstant.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package constants;

/**
 * URL fragments, request headers, CSS selectors and attribute names used when
 * scraping Tokopedia listing pages.
 *
 * <p>The fields are {@code final} so that no code can reassign them at
 * runtime; the class only holds constants and is not meant to be instantiated.
 */
public class TokopediaConstant {

    // --- URL building blocks: BASE_URL + CATEGORY + SUB_CATEGORY + URL_PARAMETER + <page number> ---
    public static final String BASE_URL = "https://www.tokopedia.com/";
    public static final String CATEGORY = "p/handphone-tablet/";
    public static final String SUB_CATEGORY = "handphone";
    public static final String URL_PARAMETER = "?ob=5&page=";
    public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36";

    // --- CSS selectors applied to the fetched HTML ---
    // NOTE(review): the "css-..." class names look auto-generated by the site and may
    // change without notice; verify them against the live markup if scraping returns blanks.
    public static final String HTML_PRODUCT_LIST = "a[data-testid=lnkProductContainer]";
    public static final String HTML_IMG_TITLE = "img[title]";
    public static final String HTML_PRICE = "span[class=css-o5uqvq]";
    public static final String HTML_MERCHANT = "div[class=css-vbihp9]";
    public static final String HTML_RATING = "img[alt=star]";
    public static final String HTML_DESCRIPTION = "div[data-testid=lblPDPDescriptionProduk]";

    // --- HTML attribute/tag names and miscellaneous literals ---
    public static final String ALT = "alt";
    public static final String SRC = "src";
    public static final String HREF = "href";
    /** Charset name used when URL-decoding product links. */
    public static final String ENC = "UTF-8";
    public static final String SPAN = "span";
    /** Suffix appended to the star count to form the rating text, e.g. {@code "4/5"}. */
    public static final String RATING_RANGE = "/5";

    /** Constant holder; no instances. */
    private TokopediaConstant() {
    }
}
71 changes: 71 additions & 0 deletions src/main/java/model/Product.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package model;

/**
 * Mutable bean holding the scraped attributes of a single product.
 * All attributes are kept as plain strings and default to {@code null}.
 */
public class Product {

    private String name;
    private String description;
    private String imageLink;
    private String price;
    private String rating;
    private String merchant;

    // ---- name ----

    /** @return the product name, or {@code null} if not set */
    public String getName() {
        return this.name;
    }

    /** @param name the product name */
    public void setName(String name) {
        this.name = name;
    }

    // ---- description ----

    /** @return the product description, or {@code null} if not set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the product description */
    public void setDescription(String description) {
        this.description = description;
    }

    // ---- image link ----

    /** @return the product image link, or {@code null} if not set */
    public String getImageLink() {
        return this.imageLink;
    }

    /** @param imageLink the product image link */
    public void setImageLink(String imageLink) {
        this.imageLink = imageLink;
    }

    // ---- price ----

    /** @return the price text, or {@code null} if not set */
    public String getPrice() {
        return this.price;
    }

    /** @param price the price text */
    public void setPrice(String price) {
        this.price = price;
    }

    // ---- rating ----

    /** @return the rating text, or {@code null} if not set */
    public String getRating() {
        return this.rating;
    }

    /** @param rating the rating text */
    public void setRating(String rating) {
        this.rating = rating;
    }

    // ---- merchant ----

    /** @return the merchant name, or {@code null} if not set */
    public String getMerchant() {
        return this.merchant;
    }

    /** @param merchant the merchant name */
    public void setMerchant(String merchant) {
        this.merchant = merchant;
    }

    /**
     * Renders every field as {@code key='value'} pairs inside
     * {@code model.Product{...}}; unset fields appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("model.Product{");
        text.append("name='").append(name).append('\'');
        text.append(", description='").append(description).append('\'');
        text.append(", imageLink='").append(imageLink).append('\'');
        text.append(", price='").append(price).append('\'');
        text.append(", rating='").append(rating).append('\'');
        text.append(", merchant='").append(merchant).append('\'');
        return text.append('}').toString();
    }
}
94 changes: 94 additions & 0 deletions src/main/java/utils/TokopediaScraperUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package utils;

import com.opencsv.CSVWriter;
import constants.ProductConstant;
import constants.TokopediaConstant;
import model.Product;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileWriter;
import java.io.IOException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
 * Scrapes product listings from Tokopedia and exports them as CSV.
 */
public class TokopediaScraperUtil {

    /**
     * Collects up to {@code size} products by walking the listing pages in
     * order, starting at page 1.
     *
     * <p>For every listing tile one additional request is made to the product's
     * detail page in order to read its description.
     *
     * <p>Scraping stops as soon as {@code size} products have been collected, or
     * when a listing page contains no product tiles (so the returned list may be
     * shorter than {@code size}).
     *
     * @param size maximum number of products to collect; values {@code <= 0}
     *             yield an empty list without any network access
     * @return the collected products, never more than {@code size}
     * @throws IOException if a listing or detail page cannot be fetched
     */
    public static List<Product> extract(int size) throws IOException {

        String categoryUrl = TokopediaConstant.BASE_URL + TokopediaConstant.CATEGORY
                + TokopediaConstant.SUB_CATEGORY;
        String pageUrlPrefix = categoryUrl + TokopediaConstant.URL_PARAMETER;
        List<Product> productList = new ArrayList<>();
        int pageCounter = 1;

        while (productList.size() < size) {
            Document doc = Jsoup.connect(pageUrlPrefix + pageCounter)
                    .referrer(categoryUrl)
                    .userAgent(TokopediaConstant.USER_AGENT)
                    .get();
            Elements productElements = doc.select(TokopediaConstant.HTML_PRODUCT_LIST);

            // An empty page means there is nothing left to collect; without this
            // guard the loop would keep requesting pages forever.
            if (productElements.isEmpty()) {
                break;
            }

            for (Element element : productElements) {
                // Compare against what has been collected so far, not the page size,
                // so the result never exceeds the requested amount.
                if (productList.size() >= size) {
                    break;
                }
                productList.add(toProduct(element));
            }

            // Advance once per listing page (not once per product).
            pageCounter++;
        }
        return productList;
    }

    /**
     * Builds a {@link Product} from one listing tile, fetching the detail page
     * for the description.
     */
    private static Product toProduct(Element element) throws IOException {
        Elements image = element.select(TokopediaConstant.HTML_IMG_TITLE);
        String detailUrl = resolveDetailUrl(element.attr(TokopediaConstant.HREF));
        Document docDetails = Jsoup.connect(detailUrl)
                .userAgent(TokopediaConstant.USER_AGENT)
                .get();

        // last() returns null when the tile has no merchant span; fall back to an
        // empty string, which is what the other fields yield when nothing matches.
        Element merchant = element.select(TokopediaConstant.HTML_MERCHANT)
                .select(TokopediaConstant.SPAN)
                .last();

        Product product = new Product();
        product.setName(image.attr(TokopediaConstant.ALT));
        product.setPrice(element.select(TokopediaConstant.HTML_PRICE).text());
        product.setMerchant(merchant == null ? "" : merchant.text());
        // Rating is the number of star images in the tile, rendered as "<n>/5".
        product.setRating(element.select(TokopediaConstant.HTML_RATING).size()
                + TokopediaConstant.RATING_RANGE);
        product.setDescription(docDetails.select(TokopediaConstant.HTML_DESCRIPTION).text());
        product.setImageLink(image.attr(TokopediaConstant.SRC));
        return product;
    }

    /**
     * URL-decodes a tile's href and rebuilds the detail-page URL from the part
     * that follows the base URL. If the decoded href does not contain anything
     * after the base URL, the decoded href is returned unchanged rather than
     * failing with an index error.
     */
    private static String resolveDetailUrl(String href) throws IOException {
        String decoded = URLDecoder.decode(href, TokopediaConstant.ENC);
        String[] parts = decoded.split(TokopediaConstant.BASE_URL);
        if (parts.length < 2) {
            return decoded;
        }
        return TokopediaConstant.BASE_URL + parts[1];
    }

    /**
     * Writes the products to a CSV file: one header row followed by one row per
     * product, with the columns name, description, image link, price, rating
     * and merchant.
     *
     * @param productList products to write
     * @param fileName    path of the CSV file to create or overwrite
     * @throws IOException if the file cannot be opened or closed
     */
    public static void export(List<Product> productList, String fileName) throws IOException {

        List<String[]> rows = new ArrayList<>(productList.size() + 1);
        rows.add(new String[]{
                ProductConstant.NAME,
                ProductConstant.DESCRIPTION,
                ProductConstant.IMAGE_LINK,
                ProductConstant.PRICE,
                ProductConstant.RATING,
                ProductConstant.MERCHANT
        });

        for (Product product : productList) {
            rows.add(new String[]{
                    product.getName(),
                    product.getDescription(),
                    product.getImageLink(),
                    product.getPrice(),
                    product.getRating(),
                    product.getMerchant()
            });
        }

        // try-with-resources closes the writer (and the underlying file handle)
        // even when writing fails; flush() is kept so buffered rows are pushed
        // out before the close.
        try (CSVWriter writer = new CSVWriter(new FileWriter(fileName))) {
            writer.writeAll(rows);
            writer.flush();
        }
    }
}
Loading

0 comments on commit 8c5d83c

Please sign in to comment.