forked from BeastByteAI/scikit-llm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpt_vectorizer.py
94 lines (79 loc) · 2.97 KB
/
gpt_vectorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from __future__ import annotations
from typing import Any, List, Optional, Union
import numpy as np
import pandas as pd
from numpy import ndarray
from sklearn.base import BaseEstimator as _BaseEstimator
from sklearn.base import TransformerMixin as _TransformerMixin
from tqdm import tqdm
from skllm.openai.embeddings import get_embedding as _get_embedding
from skllm.openai.mixin import OpenAIMixin as _OAIMixin
from skllm.utils import to_numpy as _to_numpy
class GPTVectorizer(_BaseEstimator, _TransformerMixin, _OAIMixin):
"""
A class that uses OPEN AI embedding model that converts text to GPT embeddings.
Parameters
----------
openai_embedding_model : str
The OPEN AI embedding model to use. Defaults to "text-embedding-ada-002".
openai_key : str, optional
The OPEN AI key to use. Defaults to None.
openai_org : str, optional
The OPEN AI organization ID to use. Defaults to None.
"""
def __init__(
self,
openai_embedding_model: str = "text-embedding-ada-002",
openai_key: Optional[str] = None,
openai_org: Optional[str] = None,
):
self.openai_embedding_model = openai_embedding_model
self._set_keys(openai_key, openai_org)
def fit(self, X: Any = None, y: Any = None, **kwargs) -> GPTVectorizer:
"""
Fits the GPTVectorizer to the data.
This is modelled to function as the sklearn fit method.
Parameters
----------
X : Any, optional
y : Any, optional
kwargs : dict, optional
Returns
-------
self : GPTVectorizer
"""
return self
def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> ndarray:
"""
Transforms a list of strings into a list of GPT embeddings.
This is modelled to function as the sklearn transform method
Parameters
----------
X : Optional[Union[np.ndarray, pd.Series, List[str]]]
The input array of strings to transform into GPT embeddings.
Returns
-------
embeddings : np.ndarray
"""
X = _to_numpy(X)
embeddings = []
for i in tqdm(range(len(X))):
embeddings.append(
_get_embedding(X[i], self._get_openai_key(), self._get_openai_org())
)
embeddings = np.asarray(embeddings)
return embeddings
def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y=None, **fit_params) -> ndarray:
"""
Fits and transforms a list of strings into a list of GPT embeddings.
This is modelled to function as the sklearn fit_transform method
Parameters
----------
X : Optional[Union[np.ndarray, pd.Series, List[str]]]
The input array of strings to transform into GPT embeddings.
y : Any, optional
Returns
-------
embeddings : np.ndarray
"""
return self.fit(X, y).transform(X)