-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from superco01/development
Development
- Loading branch information
Showing
10 changed files
with
1,673 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
.idea | ||
.target | ||
/target |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
|
||
# Web Scraping | ||
|
||
A utility that scrapes data from several websites to speed up data collection. | ||
## Features | ||
v1.0 | ||
- Collect data from Tokopedia (e-commerce) | ||
- Configurable number of items to collect (from 1 to 100 in the current version) | ||
- Export collected data to CSV format | ||
## Prerequisites | ||
Programming Language: Java 8 or higher (https://www.oracle.com/java/technologies/downloads/) | ||
|
||
Dependencies: | ||
- Jsoup latest version (https://jsoup.org/) | ||
- Opencsv latest version (http://opencsv.sourceforge.net/) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<groupId>org.brick</groupId> | ||
<artifactId>Brick</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
|
||
<properties> | ||
<maven.compiler.source>8</maven.compiler.source> | ||
<maven.compiler.target>8</maven.compiler.target> | ||
</properties> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>org.jsoup</groupId> | ||
<artifactId>jsoup</artifactId> | ||
<version>1.14.3</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.opencsv</groupId> | ||
<artifactId>opencsv</artifactId> | ||
<version>5.6</version> | ||
</dependency> | ||
</dependencies> | ||
|
||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import model.Product; | ||
import utils.TokopediaScraperUtil; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
|
||
public class Application { | ||
|
||
public static void main(String[] args) throws IOException { | ||
|
||
List<Product> resultList = TokopediaScraperUtil.extract(100); | ||
TokopediaScraperUtil.export(resultList, "storage/Tokopedia_Phone.csv"); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package constants; | ||
|
||
/**
 * Column titles used for the header row of the exported product CSV.
 *
 * <p>All values are compile-time constants; the class is a holder only and cannot be
 * instantiated.
 */
public class ProductConstant {

    public static final String NAME = "Name";
    public static final String DESCRIPTION = "Description";
    public static final String IMAGE_LINK = "Image Link";
    public static final String PRICE = "Price";
    public static final String RATING = "Rating";
    public static final String MERCHANT = "Merchant";

    /** Constants holder; not meant to be instantiated. */
    private ProductConstant() {
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
package constants; | ||
|
||
/**
 * Fixed values used when scraping Tokopedia: URL parts, the request user agent, CSS selectors
 * for the scraped elements, and a few attribute/tag names.
 *
 * <p>All values are compile-time constants; the class is a holder only and cannot be
 * instantiated.
 */
public class TokopediaConstant {

    // URL building blocks and request headers.
    public static final String BASE_URL = "https://www.tokopedia.com/";
    public static final String CATEGORY = "p/handphone-tablet/";
    public static final String SUB_CATEGORY = "handphone";
    public static final String URL_PARAMETER = "?ob=5&page=";
    public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36";

    // CSS selectors for the product cards and the fields read from them.
    public static final String HTML_PRODUCT_LIST = "a[data-testid=lnkProductContainer]";
    public static final String HTML_IMG_TITLE = "img[title]";
    public static final String HTML_PRICE = "span[class=css-o5uqvq]";
    public static final String HTML_MERCHANT = "div[class=css-vbihp9]";
    public static final String HTML_RATING = "img[alt=star]";
    public static final String HTML_DESCRIPTION = "div[data-testid=lblPDPDescriptionProduk]";

    // Attribute names, tag names and miscellaneous literals.
    public static final String ALT = "alt";
    public static final String SRC = "src";
    public static final String HREF = "href";
    public static final String ENC = "UTF-8";
    public static final String SPAN = "span";
    public static final String RATING_RANGE = "/5";

    /** Constants holder; not meant to be instantiated. */
    private TokopediaConstant() {
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
package model; | ||
|
||
/**
 * Mutable bean describing one product. Every attribute is stored as a plain string and is
 * {@code null} until its setter has been called.
 */
public class Product {

    private String name;
    private String description;
    private String imageLink;
    private String price;
    private String rating;
    private String merchant;

    /** @return the product name, or {@code null} if not set */
    public String getName() {
        return this.name;
    }

    /** @param name the product name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the product description, or {@code null} if not set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the product description to store */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the image link, or {@code null} if not set */
    public String getImageLink() {
        return this.imageLink;
    }

    /** @param imageLink the image link to store */
    public void setImageLink(String imageLink) {
        this.imageLink = imageLink;
    }

    /** @return the price text, or {@code null} if not set */
    public String getPrice() {
        return this.price;
    }

    /** @param price the price text to store */
    public void setPrice(String price) {
        this.price = price;
    }

    /** @return the rating text, or {@code null} if not set */
    public String getRating() {
        return this.rating;
    }

    /** @param rating the rating text to store */
    public void setRating(String rating) {
        this.rating = rating;
    }

    /** @return the merchant name, or {@code null} if not set */
    public String getMerchant() {
        return this.merchant;
    }

    /** @param merchant the merchant name to store */
    public void setMerchant(String merchant) {
        this.merchant = merchant;
    }

    /**
     * Renders all six attributes in declaration order, each wrapped in single quotes; unset
     * attributes appear as the text {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "model.Product{name='%s', description='%s', imageLink='%s', price='%s', rating='%s', merchant='%s'}",
                this.name, this.description, this.imageLink, this.price, this.rating, this.merchant);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
package utils; | ||
|
||
import com.opencsv.CSVWriter; | ||
import constants.ProductConstant; | ||
import constants.TokopediaConstant; | ||
import model.Product; | ||
import org.jsoup.Connection; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
import java.io.FileWriter; | ||
import java.io.IOException; | ||
import java.net.URLDecoder; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Objects; | ||
|
||
public class TokopediaScraperUtil { | ||
|
||
public static List<Product> extract(int size) throws IOException { | ||
|
||
String url = TokopediaConstant.BASE_URL + TokopediaConstant.CATEGORY | ||
+ TokopediaConstant.SUB_CATEGORY + TokopediaConstant.URL_PARAMETER; | ||
int pageCounter = 1; | ||
List<Product> productList = new ArrayList<>(); | ||
Product product; | ||
|
||
while (productList.size() < size) { | ||
Connection connection = Jsoup.connect(url + pageCounter) | ||
.referrer(TokopediaConstant.BASE_URL + TokopediaConstant.CATEGORY + TokopediaConstant.SUB_CATEGORY) | ||
.userAgent(TokopediaConstant.USER_AGENT); | ||
|
||
Document doc = connection.get(); | ||
Elements productElements = doc.select(TokopediaConstant.HTML_PRODUCT_LIST); | ||
|
||
for (Element element : | ||
productElements) { | ||
Elements image = element.select(TokopediaConstant.HTML_IMG_TITLE); | ||
String decodedUrl = TokopediaConstant.BASE_URL + URLDecoder.decode | ||
(element.attr(TokopediaConstant.HREF), TokopediaConstant.ENC) | ||
.split(TokopediaConstant.BASE_URL)[1]; | ||
Document docDetails = Jsoup.connect(decodedUrl).userAgent(TokopediaConstant.USER_AGENT).get(); | ||
product = new Product(); | ||
product.setName(image.attr(TokopediaConstant.ALT)); | ||
product.setPrice(element.select(TokopediaConstant.HTML_PRICE).text()); | ||
product.setMerchant(Objects.requireNonNull(element.select(TokopediaConstant.HTML_MERCHANT) | ||
.select(TokopediaConstant.SPAN).last()).text()); | ||
product.setRating(element.select(TokopediaConstant.HTML_RATING).size() | ||
+ TokopediaConstant.RATING_RANGE); | ||
product.setDescription(docDetails.select(TokopediaConstant.HTML_DESCRIPTION).text()); | ||
product.setImageLink(image.attr(TokopediaConstant.SRC)); | ||
productList.add(product); | ||
pageCounter++; | ||
if (productElements.size() >= size) { | ||
break; | ||
} | ||
} | ||
} | ||
return productList; | ||
} | ||
|
||
public static void export(List<Product> productList, String fileName) throws IOException { | ||
|
||
List<String[]> list = new ArrayList<>(); | ||
String[] header = { | ||
ProductConstant.NAME, | ||
ProductConstant.DESCRIPTION, | ||
ProductConstant.IMAGE_LINK, | ||
ProductConstant.PRICE, | ||
ProductConstant.RATING, | ||
ProductConstant.MERCHANT | ||
}; | ||
|
||
list.add(header); | ||
String[] value; | ||
for (Product product : productList) { | ||
value = new String[]{ | ||
product.getName(), | ||
product.getDescription(), | ||
product.getImageLink(), | ||
product.getPrice(), | ||
product.getRating(), | ||
product.getMerchant() | ||
}; | ||
list.add(value); | ||
} | ||
|
||
CSVWriter writer = new CSVWriter(new FileWriter(fileName)); | ||
writer.writeAll(list); | ||
writer.flush(); | ||
} | ||
} |
Oops, something went wrong.