6 6 from numpy.lib.function_base import iterable
7 7 from pandas.api.types import CategoricalDtype
8 8 from pandas.core.groupby import DataFrameGroupBy
9 - from scipy.sparse import hstack
10 - from sklearn.naive_bayes import MultinomialNB
11 - from sklearn.preprocessing import OneHotEncoder
12 9 from spacy.strings import StringStore
13 10 from spacy.tokens import Doc
14 11
@@ -83,7 +80,7 @@ def _preprocess_data(self, corpus: Iterable[Doc]) -> pd.DataFrame:
83 80 )
84 81
85 82 # Assign a sentence id to each token
86 - df = df.groupby("DOC_ID").apply(self._retrieve_lines)
83 + df = df.groupby("DOC_ID", as_index=False).apply(self._retrieve_lines)
87 84 df["SENTENCE_ID"] = df["SENTENCE_ID"].astype("int")
88 85
89 86 # Compute B1 and B2
@@ -404,6 +401,8 @@ def _fit_M1(
404 401 [description]
405 402
406 403 """
404 + from sklearn.naive_bayes import MultinomialNB
405 +
407 406 # Encode classes to OneHotEncoder representation
408 407 encoder_A1_A2 = self._fit_encoder_2S(A1, A2)
409 408 self.encoder_A1_A2 = encoder_A1_A2
@@ -427,6 +426,7 @@ def _fit_M2(self, B1: pd.Series, B2: pd.Series, label: pd.Series):
427 426 B2 : pd.Series
428 427 label : pd.Series
429 428 """
429 + from sklearn.naive_bayes import MultinomialNB
430 430
431 431 # Encode classes to OneHotEncoder representation
432 432 encoder_B1 = self._fit_encoder_1S(B1)
@@ -456,6 +456,8 @@ def _get_X_for_M1(
456 456 -------
457 457 np.ndarray
458 458 """
459 + from scipy.sparse import hstack
460 +
459 461 A1_enc = self._encode_series(self.encoder_A1_A2, A1)
460 462 A2_enc = self._encode_series(self.encoder_A1_A2, A2)
461 463 A3_enc = self._encode_series(self.encoder_A3_A4, A3)
@@ -475,6 +477,8 @@ def _get_X_for_M2(self, B1: pd.Series, B2: pd.Series) -> np.ndarray:
475 477 -------
476 478 np.ndarray
477 479 """
480 + from scipy.sparse import hstack
481 +
478 482 B1_enc = self._encode_series(self.encoder_B1, B1)
479 483 B2_enc = self._encode_series(self.encoder_B2, B2)
480 484 X = hstack([B1_enc, B2_enc])
@@ -520,7 +524,7 @@ def _predict_M2(self, B1: pd.Series, B2: pd.Series) -> Dict[str, Any]:
520 524 outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
521 525 return outputs
522 526
523 - def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series) -> OneHotEncoder:
527 + def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series):
524 528 """Fit a one hot encoder with 2 Series. It concatenates the series and after it
525 529 fits.
526 530
@@ -539,7 +543,7 @@ def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series) -> OneHotEncoder:
539 543 encoder = self._fit_one_hot_encoder(S)
540 544 return encoder
541 545
541545
542- def _fit_encoder_1S (self , S1 : pd .Series ) -> OneHotEncoder :
546+ def _fit_encoder_1S (self , S1 : pd .Series ):
543547 """Fit a one hot encoder with 1 Series.
544548
545549 Parameters
@@ -554,7 +558,7 @@ def _fit_encoder_1S(self, S1: pd.Series) -> OneHotEncoder:
554558 encoder = self ._fit_one_hot_encoder (_S1 )
555559 return encoder
556560
557 - def _encode_series(self, encoder: OneHotEncoder, S: pd.Series) -> np.ndarray:
561 + def _encode_series(self, encoder, S: pd.Series) -> np.ndarray:
558 562 """Use the one hot encoder to transform a series.
559 563
560 564 Parameters
@@ -751,7 +755,7 @@ def _get_string(cls, _id: int, string_store: StringStore) -> str:
751 755 return string_store[_id]
752 756
753 757 @classmethod
754 - def _fit_one_hot_encoder(cls, X: np.ndarray) -> OneHotEncoder:
758 + def _fit_one_hot_encoder(cls, X: np.ndarray):
755 759 """Fit a one hot encoder.
756 760
757 761 Parameters
@@ -763,6 +767,8 @@ def _fit_one_hot_encoder(cls, X: np.ndarray) -> OneHotEncoder:
763 767 -------
764 768 OneHotEncoder
765 769 """
770 + from sklearn.preprocessing import OneHotEncoder
771 +
766 772 encoder = OneHotEncoder(handle_unknown="ignore")
767 773 encoder.fit(X)
768 774 return encoder