LabelEncoder
Encode labels into unique indices.
# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(labels)
y = label_encoder.encode(labels)
Source code in tagifai/data.py
class LabelEncoder:
    """Encode labels into unique indices.

    ```python
    # Encode labels
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)
    ```

    Three attributes are kept in sync by `__init__` and `fit`:
    `class_to_index` (label -> index), `index_to_class` (index -> label)
    and `classes` (the labels, in mapping order).
    """

    def __init__(self, class_to_index: Dict = None) -> None:
        """Initialize the label encoder.

        Args:
            class_to_index (Dict, optional): mapping between classes and unique indices.
                Defaults to None, which is treated as an empty mapping.
        """
        # Copy the mapping so the encoder never shares state with a
        # caller-owned dict (or with a default argument object).
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Return the number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        """Return a short summary including the number of classes."""
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y: List):
        """Fit a list of labels to the encoder.

        Any previously stored mapping is replaced, so indices always run
        from 0 to len(classes) - 1 without collisions.

        Args:
            y (List): raw labels.

        Returns:
            Fitted LabelEncoder instance.
        """
        # np.unique returns the sorted unique labels; .tolist() converts NumPy
        # scalars to native Python values so the mapping stays JSON-serializable.
        classes = np.unique(y).tolist()
        self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
        self.index_to_class = {i: class_ for class_, i in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y: List) -> np.ndarray:
        """Encode a list of raw labels.

        Args:
            y (List): raw labels.

        Returns:
            np.ndarray: encoded labels as indices.

        Raises:
            KeyError: if a label is not present in `class_to_index`.
        """
        encoded = np.zeros((len(y)), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y: List) -> List:
        """Decode a list of indices.

        Args:
            y (List): indices.

        Returns:
            List: labels.

        Raises:
            KeyError: if an index is not present in `index_to_class`.
        """
        return [self.index_to_class[index] for index in y]

    def save(self, fp: str) -> None:
        """Save class instance to JSON file.

        Args:
            fp (str): filepath to save to.
        """
        contents = {"class_to_index": self.class_to_index}
        # Use a separate name for the handle so `fp` keeps meaning "path".
        with open(fp, "w", encoding="utf-8") as json_file:
            json.dump(contents, json_file, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp: str):
        """Load instance of LabelEncoder from file.

        JSON object keys are always strings, so labels that were not strings
        when saved come back as strings.

        Args:
            fp (str): JSON filepath to load from.

        Returns:
            LabelEncoder instance.
        """
        with open(fp, encoding="utf-8") as json_file:
            kwargs = json.load(json_file)
        return cls(**kwargs)
__init__(self, class_to_index={})
special
Initialize the label encoder.
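Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `class_to_index` | `Dict` | mapping between classes and unique indices. | `{}` |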
Source code in tagifai/data.py
def __init__(self, class_to_index: Dict = None) -> None:
    """Initialize the label encoder.

    Args:
        class_to_index (Dict, optional): mapping between classes and unique indices.
            Defaults to None, which is treated as an empty mapping.
    """
    # Copy the mapping so the encoder never shares state with a caller-owned
    # dict (or with a default argument object).
    self.class_to_index = dict(class_to_index) if class_to_index else {}
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())
decode(self, y)
Decode a list of indices.
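Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `y` | `List` | indices. | required |

Returns:

| Type | Description |
|---|---|
| `List` | labels. |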
Source code in tagifai/data.py
def decode(self, y: List) -> List:
    """Decode a list of indices.

    Args:
        y (List): indices.

    Returns:
        List: labels.

    Raises:
        KeyError: if an index is not present in `self.index_to_class`.
    """
    return [self.index_to_class[index] for index in y]
encode(self, y)
Encode a list of raw labels.
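Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `y` | `List` | raw labels. | required |

Returns:

| Type | Description |
|---|---|
| `np.ndarray` | encoded labels as indices. |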
Source code in tagifai/data.py
def encode(self, y: List) -> np.ndarray:
    """Map each raw label to its integer index.

    Args:
        y (List): raw labels.

    Returns:
        np.ndarray: one integer index per label, in input order.
    """
    # Look every label up in the fitted mapping, then pack into an int array.
    indices = [self.class_to_index[label] for label in y]
    return np.array(indices, dtype=int)
fit(self, y)
Fit a list of labels to the encoder.
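Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `y` | `List` | raw labels. | required |

Returns:

| Type | Description |
|---|---|
| | Fitted LabelEncoder instance. |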
Source code in tagifai/data.py
def fit(self, y: List):
    """Fit a list of labels to the encoder.

    Any previously stored mapping is replaced rather than updated in place,
    so indices always run from 0 to len(classes) - 1 without collisions.

    Args:
        y (List): raw labels.

    Returns:
        Fitted LabelEncoder instance.
    """
    # np.unique returns the sorted unique labels; .tolist() converts NumPy
    # scalars to native Python values so the mapping stays JSON-serializable.
    classes = np.unique(y).tolist()
    self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
    self.index_to_class = {i: class_ for class_, i in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())
    return self
load(fp)
classmethod
Load instance of LabelEncoder from file.
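Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `fp` | `str` | JSON filepath to load from. | required |

Returns:

| Type | Description |
|---|---|
| | LabelEncoder instance. |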
Source code in tagifai/data.py
@classmethod
def load(cls, fp: str):
    """Load instance of LabelEncoder from file.

    The parsed JSON object is passed to the constructor as keyword arguments.

    Args:
        fp (str): JSON filepath to load from.

    Returns:
        LabelEncoder instance.
    """
    # Use a separate name for the handle so `fp` keeps meaning "path".
    with open(fp, encoding="utf-8") as json_file:
        kwargs = json.load(json_file)
    return cls(**kwargs)
save(self, fp)
Save class instance to JSON file.
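Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `fp` | `str` | filepath to save to. | required |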
Source code in tagifai/data.py
def save(self, fp: str) -> None:
    """Save class instance to JSON file.

    Only `self.class_to_index` is written, under the "class_to_index" key.

    Args:
        fp (str): filepath to save to.
    """
    contents = {"class_to_index": self.class_to_index}
    # Use a separate name for the handle so `fp` keeps meaning "path".
    with open(fp, "w", encoding="utf-8") as json_file:
        json.dump(contents, json_file, indent=4, sort_keys=False)
clean_text(text, lower, stem, stopwords=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])
Clean raw text.
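Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `text` | `str` | raw text to be cleaned. | required |
| `lower` | `bool` | whether to lowercase the text. | required |
| `stem` | `bool` | whether to stem the text. | required |

Returns:

| Type | Description |
|---|---|
| `str` | cleaned text. |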
Source code in tagifai/data.py
def clean_text(text: str, lower: bool, stem: bool, stopwords=config.STOPWORDS) -> str:
    """Clean raw text.

    Steps, in order: optional lowercasing, link removal, stopword removal,
    removal of non-alphanumeric characters, whitespace normalization and
    optional stemming.

    Args:
        text (str): raw text to be cleaned.
        lower (bool): whether to lowercase the text.
        stem (bool): whether to stem the text.
        stopwords (optional): words to remove from the text. Matching is
            case-sensitive. Defaults to config.STOPWORDS.

    Returns:
        str: cleaned text.
    """
    # Lower
    if lower:
        text = text.lower()

    # Remove links. This must run before punctuation is stripped: once
    # "http://a.b/c" has been split into "http a b c", the pattern can no
    # longer match the whole URL and its fragments would stay in the text.
    text = re.sub(r"http\S+", "", text)

    # Remove stopwords (escaped so a stopword is always matched literally)
    if stopwords:
        pattern = re.compile(r"\b(" + r"|".join(re.escape(word) for word in stopwords) + r")\b\s*")
        text = pattern.sub("", text)

    # Spacing and filters
    text = re.sub(
        r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text
    )  # add spacing between objects to be filtered
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    # Stemming
    if stem:
        stemmer = PorterStemmer()
        text = " ".join([stemmer.stem(word, to_lowercase=lower) for word in text.split(" ")])

    return text
get_data_splits(X, y, train_size=0.7)
Generate balanced data splits.
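Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `X` | `pd.Series` | input features. | required |
| `y` | `np.ndarray` | encoded labels. | required |
| `train_size` | `float` | proportion of data to use for training. | `0.7` |

Returns:

| Type | Description |
|---|---|
| `Tuple` | data splits as Numpy arrays. |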
Source code in tagifai/data.py
def get_data_splits(X: pd.Series, y: np.ndarray, train_size: float = 0.7) -> Tuple:
    """Split the data into stratified train, validation and test sets.

    Args:
        X (pd.Series): input features.
        y (np.ndarray): encoded labels.
        train_size (float, optional): proportion of data to use for training. Defaults to 0.7.

    Returns:
        Tuple: (X_train, X_val, X_test, y_train, y_val, y_test).
    """
    # First cut: carve out the training portion, stratified on the labels.
    first_cut = train_test_split(X, y, train_size=train_size, stratify=y)
    X_train, X_rest, y_train, y_rest = first_cut

    # Second cut: halve the remainder into validation and test, stratified again.
    second_cut = train_test_split(X_rest, y_rest, train_size=0.5, stratify=y_rest)
    X_val, X_test, y_val, y_test = second_cut

    return (X_train, X_val, X_test, y_train, y_val, y_test)
preprocess(df, lower, stem, min_freq)
Preprocess the data.
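Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `df` | `pd.DataFrame` | Pandas DataFrame with data. | required |
| `lower` | `bool` | whether to lowercase the text. | required |
| `stem` | `bool` | whether to stem the text. | required |
| `min_freq` | `int` | minimum # of data points a label must have. | required |

Returns:

| Type | Description |
|---|---|
| `pd.DataFrame` | Dataframe with preprocessed data. |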
Source code in tagifai/data.py
def preprocess(df: pd.DataFrame, lower: bool, stem: bool, min_freq: int) -> pd.DataFrame:
    """Build the cleaned text feature and normalize the tag labels.

    A "text" column is added to `df` (title and description joined by a
    space, then cleaned), and the "tag" column is rewritten so that
    unaccepted and infrequent tags become "other".

    Args:
        df (pd.DataFrame): Pandas DataFrame with data.
        lower (bool): whether to lowercase the text.
        stem (bool): whether to stem the text.
        min_freq (int): minimum # of data points a label must have.

    Returns:
        pd.DataFrame: Dataframe with preprocessed data.
    """
    # Feature engineering: a single text field from title + description.
    df["text"] = df["title"] + " " + df["description"]

    # Clean the combined text.
    df["text"] = df["text"].apply(clean_text, lower=lower, stem=stem)

    # Tags outside the accepted list are mapped to "other" ...
    in_scope = replace_oos_labels(
        df=df, labels=config.ACCEPTED_TAGS, label_col="tag", oos_label="other"
    )

    # ... and so are tags that occur fewer than min_freq times.
    return replace_minority_labels(
        df=in_scope, label_col="tag", min_freq=min_freq, new_label="other"
    )
replace_minority_labels(df, label_col, min_freq, new_label='other')
Replace minority labels with another label.
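Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `df` | `pd.DataFrame` | Pandas DataFrame with data. | required |
| `label_col` | `str` | name of the dataframe column that has the labels. | required |
| `min_freq` | `int` | minimum # of data points a label must have. | required |
| `new_label` | `str` | name of the new label to replace minority labels. | `'other'` |

Returns:

| Type | Description |
|---|---|
| `pd.DataFrame` | Dataframe with replaced minority labels. |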
Source code in tagifai/data.py
def replace_minority_labels(
    df: pd.DataFrame, label_col: str, min_freq: int, new_label: str = "other"
) -> pd.DataFrame:
    """Replace minority labels with another label.

    The label column of `df` is overwritten in place. Because replacement
    goes through `fillna`, values that were already missing in the column
    also end up as `new_label`.

    Args:
        df (pd.DataFrame): Pandas DataFrame with data.
        label_col (str): name of the dataframe column that has the labels.
        min_freq (int): minimum # of data points a label must have.
        new_label (str, optional): name of the new label to replace minority labels. Defaults to "other".

    Returns:
        pd.DataFrame: Dataframe with replaced minority labels.
    """
    label_counts = Counter(df[label_col].values)
    # One pass over the distinct labels is enough to decide which to keep;
    # only membership is needed afterwards, so a set suffices.
    labels_above_freq = {label for label, count in label_counts.items() if count >= min_freq}
    df[label_col] = df[label_col].apply(lambda label: label if label in labels_above_freq else None)
    df[label_col] = df[label_col].fillna(new_label)
    return df
replace_oos_labels(df, labels, label_col, oos_label='other')
Replace out of scope (OOS) labels.
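Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `df` | `pd.DataFrame` | Pandas DataFrame with data. | required |
| `labels` | `List` | list of accepted labels. | required |
| `label_col` | `str` | name of the dataframe column that has the labels. | required |
| `oos_label` | `str` | name of the new label for OOS labels. | `'other'` |

Returns:

| Type | Description |
|---|---|
| `pd.DataFrame` | Dataframe with replaced OOS labels. |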
Source code in tagifai/data.py
def replace_oos_labels(
    df: pd.DataFrame, labels: List, label_col: str, oos_label: str = "other"
) -> pd.DataFrame:
    """Replace out of scope (OOS) labels.

    The label column of `df` is overwritten in place.

    Args:
        df (pd.DataFrame): Pandas DataFrame with data.
        labels (List): list of accepted labels.
        label_col (str): name of the dataframe column that has the labels.
        oos_label (str, optional): name of the new label for OOS labels. Defaults to "other".

    Returns:
        pd.DataFrame: Dataframe with replaced OOS labels.
    """
    # Sets give constant-time membership checks instead of scanning a list
    # for every distinct value and again for every row.
    accepted = set(labels)
    oos_tags = {item for item in df[label_col].unique() if item not in accepted}
    df[label_col] = df[label_col].apply(lambda x: oos_label if x in oos_tags else x)
    return df