-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatasets.py
99 lines (85 loc) · 3.28 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from dataclasses import asdict, dataclass
from typing import Literal, NamedTuple, Self, cast, get_args
__all__: list[str] = ["Dataset"]
TDatasetType = Literal["density", "viscosity", "surften"]
@dataclass
class DatasetDetails:
"""Represents the details of a dataset in dictionary form."""
measurement_method: str | None
water_content_ppm: int | None
sample_supplier: str | None
purification_method: str | None
class DataPoint(NamedTuple):
"""Represents a single data point in a dataset."""
temperature_K: float
pressure_MPa: float
value: float
@dataclass
class Dataset:
"""Represents a single dataset."""
type: Literal["density", "viscosity", "surften"]
accepted: bool
reference: str
ionic_liquid: str
details: DatasetDetails
data: list[DataPoint]
@classmethod
def from_text(cls, text: str) -> Self:
"""Convert a text representing a single dataset into the `Dataset` instance."""
text_list = text.split("\n")
if len(text_list) < 7:
raise ValueError(
f"Text has too few rows. Expected >7, got {len(text_list)}"
)
if text_list[0][-1] == "*":
accepted = False
text_list[0] = text_list[0].replace("*", "")
else:
accepted = True
type_ = (text_list[0].split(":"))[1].strip()
type_ = cast(TDatasetType, type_)
if type_ not in list(get_args(TDatasetType)):
raise ValueError(
f"Invalid 'type' value: {type_}. "
f"Expected one of 'density', 'viscosity', 'surften'."
)
reference = text_list[1]
ionic_liquid = text_list[4]
if (text_list[5].split(";"))[1].isdigit():
water_content_ppm = int((text_list[5].split(";"))[1])
else:
water_content_ppm = None
details = DatasetDetails(
measurement_method=(text_list[5].split(";"))[0],
water_content_ppm=water_content_ppm,
sample_supplier=(text_list[5].split(";"))[2],
purification_method=(text_list[5].split(";"))[3],
)
data = []
for row in text_list[6:]:
row_values = list(map(float, row.split(" ")))
dp = DataPoint(row_values[0], row_values[1], row_values[2])
data.append(dp)
return cls(type_, accepted, reference, ionic_liquid, details, data)
@classmethod
def from_file(cls, file_path: str) -> list[Self]:
"""Convert a file containing multiple datasets into a list of `Dataset`
instances.
"""
with open(file_path, "r") as file:
file_data = [x for x in file.read().split("\n\n") if x != ""]
if file_data[-1][-1] == "\n":
file_data[-1] = file_data[-1][:-1]
datasets_collector: list[Self] = []
for text in file_data:
try:
datasets_list = cls.from_text(text)
datasets_collector.append(datasets_list)
except Exception as e:
print(f"Skipping dataset due to following error: {str(e)}")
return datasets_collector
@classmethod
def to_dict(cls, dataset: Self) -> dict:
ds_dict = asdict(dataset)
ds_dict["data"] = [x._asdict() for x in ds_dict["data"]]
return ds_dict