-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
executable file
·61 lines (43 loc) · 1.36 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3
# encoding: UTF-8
"""
Filename: extract.py
Author: David Oniani
E-mail: [email protected]
Description:
Extract relevant information from the JSON files.
"""
import os
import json
import shutil
from typing import Any, Dict, List
# Directory whose entries are listed by main() and passed to extract().
DIR_READ: str = "data_raw"
# Directory that main() deletes (if present), recreates, and fills with
# the generated data_<idx>.txt files.
DIR_WRITE: str = "data"
def extract(filename: str) -> List[str]:
    """Collect the text passages stored in one JSON document.

    The result starts with the text of the first ``abstract`` entry (when
    the document has a non-empty ``abstract`` list), followed by the text
    of every ``body_text`` entry in file order.

    Parameter(s):
        filename: Path of the JSON file to read

    Returns:
        The extracted passages, one string per list element.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If ``body_text`` is missing or an entry has no ``text``.
    """
    # JSON is UTF-8 encoded; do not depend on the platform's locale default.
    with open(filename, "r", encoding="utf-8") as file:
        data: Dict[str, Any] = json.load(file)

    # Only the first abstract entry is kept. A missing "abstract" key is
    # treated the same way as an empty list: it contributes nothing.
    abstract = data.get("abstract")
    passages: List[str] = [abstract[0]["text"]] if abstract else []

    passages.extend(item["text"] for item in data["body_text"])
    return passages
def main() -> None:
    """Convert every file in DIR_READ into a text file in DIR_WRITE.

    Each entry of DIR_READ is passed to extract(); the passages it returns
    are joined with newlines and written to ``data_<idx>.txt`` inside
    DIR_WRITE, where ``<idx>`` is the position of the input file in the
    sorted directory listing. Any existing DIR_WRITE is removed first.
    """
    # Extract everything before touching DIR_WRITE, so a failure while
    # reading leaves previously generated output in place. The listing is
    # sorted because os.listdir() returns names in arbitrary order, which
    # would make the output numbering differ between runs.
    json_data: List[List[str]] = [
        extract(os.path.join(DIR_READ, filename))
        for filename in sorted(os.listdir(DIR_READ))
    ]

    # Start from an empty output directory.
    if os.path.exists(DIR_WRITE):
        shutil.rmtree(DIR_WRITE)
    os.mkdir(DIR_WRITE)

    for idx, content in enumerate(json_data):
        out_path = os.path.join(DIR_WRITE, f"data_{idx}.txt")
        # Explicit UTF-8 so non-ASCII text is written the same on any locale.
        with open(out_path, "w", encoding="utf-8") as file:
            file.write("\n".join(content))
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()