We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.Series().
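Before the project examples, here is a minimal, self-contained sketch of the pandas.Series constructor itself; the values and names below are made up purely for illustration:

import numpy as np
import pandas as pd

# Construct a Series from a list with an explicit index and name.
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'], name='example')

# A Series can also be built from a dict (keys become the index)
# or from a NumPy array with a default RangeIndex.
s_from_dict = pd.Series({'x': 1.5, 'y': 2.5})
s_from_array = pd.Series(np.arange(5, dtype=float))

print(s['b'])             # label-based access -> 20
print(s_from_dict.index)  # Index(['x', 'y'], dtype='object')
print(s_from_array.mean())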
def aggregate_ohlcv_panel(self, fields, ohlcv_panel, items=None, minor_axis=None):
    """
    Convert an OHLCV Panel into a DataFrame by aggregating each field's
    frame into a Series.
    """
    vals = ohlcv_panel
    if isinstance(ohlcv_panel, pd.Panel):
        vals = ohlcv_panel.values
        items = ohlcv_panel.items
        minor_axis = ohlcv_panel.minor_axis

    data = [
        self.frame_to_series(
            field,
            vals[items.get_loc(field)],
            minor_axis
        )
        for field in fields
    ]

    return np.array(data)
def to_series(tuples):
    """Transforms a list of tuples of the form (date, count) into a
    pandas series indexed by dt.
    """
    cleaned_time_val_tuples = [tuple for tuple in tuples if not (
        tuple[0] is pd.NaT or tuple[1] is None)]
    if len(cleaned_time_val_tuples) > 0:
        # change list of tuples ie [(a1, b1), (a2, b2), ...] into
        # tuple of lists ie ([a1, a2, ...], [b1, b2, ...]);
        # wrapped in list() so the result is indexable under Python 3
        unzipped_cleaned_time_values = list(zip(*cleaned_time_val_tuples))
        # just being explicit about what these are
        counts = unzipped_cleaned_time_values[1]
        timestamps = unzipped_cleaned_time_values[0]
        # Create the series with a sorted index.
        ret_val = pd.Series(counts, index=timestamps).sort_index()
    else:
        ret_val = None
    return ret_val
def __init__(self, *args, **kwargs):
    '''
    Accepts the same arguments as pandas.Series:
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

    To create an XSeries of any data_type, the data argument must be a
    Python list. For example, to create an XSeries of pandas.Series,
    pass data = [s_1, s_2, ..., s_n] where each s_i is an instance of
    pandas.Series.
    '''
    super(XSeries, self).__init__(*args, **kwargs)

    data = kwargs.get('data')
    if data is None:
        data = args[0]

    check_result, data_type = _check_all_elements_have_the_same_property(data, type)
    if not check_result:
        raise ValueError('Not all elements the same type')

    if data_type is not None:
        self._data_type = data_type
    else:
        self._data_type = type(data._values[0])
def __init__(self, dictionary=None, **kwargs):
    '''
    :param dictionary: custom dictionary to count against.
                       if None, calculate dictionary from dataset
    '''
    self.dictionary = dictionary

    accepted_types = [
        pd.Series, list, np.array, tuple
    ]

    def bag_of_words_transform_function(corpus):
        counter = Counter(corpus)
        for el in self.dictionary:
            if counter.get(el) is None:
                counter[el] = 0
        return counter

    super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
                                                columns=None,
                                                transform_function=bag_of_words_transform_function)
def test_mean_transformer():
    s1 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])

    s2 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15)),
        pd.Series(np.random.normal(size=100))
    ])

    tr = MeanSeriesTransformer()
    tr = tr.fit(s1)

    transformed_s = tr.transform(s2)

    assert transformed_s.shape[0] == 3
    assert type(transformed_s) == XSeries
def test_mean_transformer_data_frame():
    s1 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])

    s2 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])

    df = XDataFrame({
        's1': s1,
        's2': s2
    })

    tr = MeanSeriesTransformer()

    try:
        tr = tr.fit(df)
        assert False
    except:
        assert True
def test_dataframe_data_types():
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    assert df['first_col'].data_type == pd.Series
    assert df['second_col'].data_type == np.int64
    assert df['third_col'].data_type == dict
    assert df['fourth_col'].data_type == str

    assert type(df[['first_col']]) == XDataFrame
    assert type(df[['first_col', 'second_col']]) == XDataFrame
def test_dataframe_sub_frame_data_types():
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    sub_df = df.loc[:2]

    assert type(sub_df) == XDataFrame

    assert sub_df['first_col'].data_type == pd.Series
    assert sub_df['second_col'].data_type == np.int64
    assert sub_df['third_col'].data_type == dict
    assert sub_df['fourth_col'].data_type == str

    assert type(sub_df[['first_col']]) == XDataFrame
    assert type(sub_df[['first_col', 'second_col']]) == XDataFrame
def test_series_replace_element():
    s = XSeries([
        pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        pd.Series([4, 5, 6], index=['d', 'e', 'g'])
    ], name='MySuperSeries')

    try:
        s[0] = 111
        assert False
    except:
        assert True

    try:
        s[0] = pd.Series(np.random.normal(size=100))
        assert True
    except:
        assert False
def test_naming():
    X = XSeries([
        pd.Series(np.random.normal(0, 1, 100), name='X')
    ])

    df = XDataFrame({
        'X': X
    })

    dataframe_transformer = XDataFrameTransformer({
        'X': [TimeSeriesTransformer()]
    })

    dataframe_transformer.fit(df)

    transformed_df = dataframe_transformer.transform(df)

    for col_name in transformed_df.columns:
        assert col_name.startswith('X_TimeSeriesTransformer')
def test_multiple_transformers_for_one_column():
    X = XSeries([
        pd.Series(np.random.normal(0, 1, 100), name='X')
    ])

    df = XDataFrame({
        'X': X
    })

    dataframe_transformer = XDataFrameTransformer({
        'X': [TimeSeriesTransformer(), IdentityTransformer(), MeanSeriesTransformer()]
    })

    dataframe_transformer.fit(df)

    transformed_df = dataframe_transformer.transform(df)

    for col_name in transformed_df.columns:
        assert col_name.startswith('X_TimeSeriesTransformer') or \
               col_name.startswith('X_IdentityTransformer') or \
               col_name.startswith('X_MeanSeriesTransformer')
def test_ts_fresh_chain():
    s1 = XSeries([
        pd.Series(np.random.normal(0, 1, 20))
        for _ in range(10)
    ], name='X')

    pipe = PipeLineChain([
        ('mean shift', TimeSeriesWindowTransformer()),
        ('ts fresh step', TsFreshSeriesTransformer())
    ])

    pipe.fit(s1)
    transformed_df = pipe.transform(s1)

    # print(transformed_df.head())

    assert type(transformed_df) == XDataFrame
def test_bfill(self):
    # test ndim=1
    N = 100
    s = pd.Series(np.random.randn(N))
    mask = random.sample(range(N), 10)
    s.iloc[mask] = np.nan

    correct = s.bfill().values
    test = bfill(s.values)
    assert_almost_equal(correct, test)

    # test ndim=2
    df = pd.DataFrame(np.random.randn(N, N))
    df.iloc[mask] = np.nan
    correct = df.bfill().values
    test = bfill(df.values)
    assert_almost_equal(correct, test)
def test_ffill(self):
    # test ndim=1
    N = 100
    s = pd.Series(np.random.randn(N))
    mask = random.sample(range(N), 10)
    s.iloc[mask] = np.nan

    correct = s.ffill().values
    test = ffill(s.values)
    assert_almost_equal(correct, test)

    # test ndim=2
    df = pd.DataFrame(np.random.randn(N, N))
    df.iloc[mask] = np.nan
    correct = df.ffill().values
    test = ffill(df.values)
    assert_almost_equal(correct, test)
def test_conversion_to_df(self, df, infer_timestamps):
    events_by_sid = {0: df}
    loader = EventDataSetLoader(
        dtx,
        events_by_sid,
        infer_timestamps=infer_timestamps,
    )
    self.assertEqual(
        loader.events_by_sid.keys(),
        events_by_sid.keys(),
    )

    if infer_timestamps:
        expected = pd.Series(index=[dtx[0]] * 10, data=dtx,
                             name=ANNOUNCEMENT_FIELD_NAME)
    else:
        expected = pd.Series(index=dtx, data=dtx,
                             name=ANNOUNCEMENT_FIELD_NAME)
    expected.index.name = TS_FIELD_NAME

    # Check that index by first given date has been added
    assert_series_equal(
        loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME],
        expected,
    )
def getAntennaLogs():
    '''
    Retrieve information about antenna changes

    @return dictionary of antenna changes
    '''
    store_location = data_util.getDataLocation('ngl_gps')
    store = pd.HDFStore(store_location, 'r')
    logs_df = store['ngl_steps']
    store.close()

    metadata = DataFetcher.getStationMetadata()

    logs_dict = OrderedDict()

    for station in metadata.index:
        offset_dates = logs_df[logs_df['Station'] == station].index.unique()
        offset_dates = pd.Series(offset_dates)
        logs_dict[station] = offset_dates

    return logs_dict
def remove_error_poi_each_line(line_data):
    # The input line_data is a Series; its first element holds the POI list,
    # whose first entry is the district hash.
    # Iterate over a copy (temp_line_data) because calling remove() while
    # iterating over the same list would skip elements.
    standard_style = re.compile(r"\d+#\d+:\d+")
    line_data = list(line_data[0])
    temp_line_data = line_data.copy()
    for poi_in_line in temp_line_data:
        if len(poi_in_line) == 32:  # this is the district hash
            continue
        if not re.match(standard_style, poi_in_line):
            # print(poi_in_line)
            line_data.remove(poi_in_line)
    return pd.Series([line_data])
def test_series_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    frags = [df.x for df in frags]

    appending = dgd.from_pygdf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
def test_take(nelem, nparts):
    np.random.seed(0)

    # Use unique index range as the sort may not be stable-ordering
    x = np.random.randint(0, nelem, size=nelem)
    y = np.random.random(nelem)

    selected = np.random.randint(0, nelem - 1, size=nelem // 2)

    df = pd.DataFrame({'x': x, 'y': y})

    ddf = dd.from_pandas(df, npartitions=nparts)
    dgdf = dgd.from_dask_dataframe(ddf)
    out = dgdf.take(gd.Series(selected), npartitions=5)
    got = out.compute().to_pandas()

    expect = df.take(selected)
    assert 1 < out.npartitions <= 5

    np.testing.assert_array_equal(got.index, np.arange(len(got)))
    np.testing.assert_array_equal(got.x, expect.x)
    np.testing.assert_array_equal(got.y, expect.y)
def set_index(self, index, drop=True, sorted=False):
    """Set new index.

    Parameters
    ----------
    index : str or Series
        If a ``str`` is provided, it is used as the name of the
        column to be made into the index.
        If a ``Series`` is provided, it is used as the new index.
    drop : bool
        Whether the first original index column is dropped.
    sorted : bool
        Whether the new index column is already sorted.
    """
    if not drop:
        raise NotImplementedError('drop=False not supported yet')

    if isinstance(index, str):
        return self._set_index_raw(index, drop=drop, sorted=sorted)
    elif isinstance(index, Series):
        indexname = '__dask_gdf.index'
        df = self.assign(**{indexname: index})
        return df._set_index_raw(indexname, drop=drop, sorted=sorted)
    else:
        raise TypeError('cannot set_index from {}'.format(type(index)))
def connect_actors(actor_frame, connectivity_sets, connectivity_column):
    """
    :param actor_frame:
    :param connectivity_sets:
    :param connectivity_column:
    :return:

    Examples:
        same_actors = {
            'ccason': [3, 14, 15], 'clipka': [4, 5, 13],
            'wfpokorny': [11, 17], 'anshuarya': [0],
            'bentsm': [1], 'cbarton': [2], 'dbodor': [6],
            'jlecher': [7], 'jgrimbert': [8], 'nalvarez': [9],
            'selvik': [10], 'wverhelst': [12], 'gryken': [16],
            'github': [18]}
        actor_frame = connect_actors(actor_frame, same_actors, 'actor_id')
    """
    connectivity = {}
    for actor_id, connectivity_set in connectivity_sets.items():
        for actor in connectivity_set:
            connectivity[actor] = actor_id
    actor_frame[connectivity_column] = su.categorize(pd.Series(connectivity))
    return actor_frame
def _compute_author_similarity(self, paired_authors):
    def row_similarity(row):
        same_email = row.author_email == row.author_email_other
        name_similarity = fuzz.token_set_ratio(row.author_name,
                                               row.author_name_other)
        email_name_similarity = fuzz.ratio(row.email_name,
                                           row.email_name_other)
        name_to_email_similarity = fuzz.token_set_ratio(
            row.author_name, row.name_from_email_other)
        return pd.Series(
            [same_email, name_similarity, email_name_similarity,
             name_to_email_similarity])

    newcols = paired_authors.apply(row_similarity, axis=1)
    newcols.columns = ['same_email', 'name_similarity',
                       'email_name_similarity', 'name_to_email_similarity']
    newdf = paired_authors.join(newcols)
    return newdf
def SMA(Series, N, M=1):
    ret = []
    i = 1
    length = len(Series)
    # skip any leading NaN values in the input Series
    while i < length:
        if np.isnan(Series[i]):
            i += 1
        else:
            break
    preY = Series[i]  # Y'
    ret.append(preY)
    while i < length:
        Y = (M * Series[i] + (N - M) * preY) / float(N)
        ret.append(Y)
        preY = Y
        i += 1
    return pd.Series(ret)
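The SMA helper above is a recursively smoothed average: each output value is (M * x + (N - M) * previous) / N, i.e. an exponential smoothing with weight M/N. Assuming the SMA function above is in scope, a minimal usage sketch with made-up prices:

import numpy as np
import pandas as pd

# Hypothetical closing prices, purely for illustration.
close = pd.Series([10.0, 10.5, 10.2, 11.0, 11.3, 10.9, 11.6])

# N=5, M=1 gives each new price a weight of 1/5 and the running
# smoothed value a weight of 4/5.
smoothed = SMA(close, N=5, M=1)
print(smoothed)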
def QA_indicator_dpo(data, N=20, M=6):
    """
    DPO (Detrended Price Oscillator).

    Removes the longer-term trend from the price series by subtracting
    a moving average from the price:
        DPO   = price - MA(price, N/2 + 1)
        MADPO = MA(DPO, M)
    Values above zero indicate the price is above its detrended baseline;
    values below zero indicate the opposite.
    """
    _dpo = pd.Series(data) - pd.Series(data).rolling(N // 2 + 1).mean()
    _madpo = pd.Series(_dpo).rolling(M).mean()
    return _dpo, _madpo
def stats_desc(self, store_key, cond):
    '''
    Args
        store_key (string): define which data to be analyzed in the workspace
        cond (string): sample observation

    Returns
        descriptive statistics
    '''
    datas = list()
    for ite_file in self.store.keys():
        datas.append(self.store[ite_file][store_key][str(cond)]['mean'].value)
    datas = pd.Series(datas)
    return datas.describe()

# one way ANOVA
# for scalar value usage only
def df_add(self, column, added_info):
    '''
    Args
        column (string): the column name to be played with
        added_info (string, int, float or pandas.DataFrame):
            the information to be added to the selected column;
            can be string, int, float, or pandas.DataFrame

    Returns
        -
    '''
    if isinstance(added_info, str):
        self.data_df[column] = self.data_df[column] + self.data_df[added_info]
    elif isinstance(added_info, (int, float)):
        self.data_df[column] = self.data_df[column] + added_info
    elif isinstance(added_info, (pd.Series, pd.DataFrame)):
        self.data_df[column] = self.data_df[column] + added_info

# This function subtracts from a given column
def df_minus(self, column, minus_info):
    '''
    Args
        column (string): the column name to be played with
        minus_info (string, int, float or pandas.DataFrame):
            information to be subtracted from the selected column

    Returns
        -
    '''
    if isinstance(minus_info, str):
        self.data_df[column] = self.data_df[column] - self.data_df[minus_info]
    elif isinstance(minus_info, (int, float)):
        self.data_df[column] = self.data_df[column] - minus_info
    elif isinstance(minus_info, (pd.Series, pd.DataFrame)):
        self.data_df[column] = self.data_df[column] - minus_info

# This function multiplies the selected column by a certain factor
def df_multiply(self, column, multiply_info):
    '''
    Args
        column (string): the column name to be played with
        multiply_info (string, int, float or pandas.DataFrame):
            information to be used for multiplying

    Returns
        -
    '''
    if isinstance(multiply_info, str):
        self.data_df[column] = self.data_df[column] * self.data_df[multiply_info]
    elif isinstance(multiply_info, (int, float)):
        self.data_df[column] = self.data_df[column] * multiply_info
    elif isinstance(multiply_info, (pd.Series, pd.DataFrame)):
        self.data_df[column] = self.data_df[column] * multiply_info

# This function divides the selected column by a certain factor
def df_division(self, column, division_info):
    '''
    Args
        column (string): the column name to be played with
        division_info (string, int, float or pandas.DataFrame):
            information to be used for dividing

    Returns
        -
    '''
    if isinstance(division_info, str):
        self.data_df[column] = self.data_df[column] / self.data_df[division_info]
    elif isinstance(division_info, (int, float)):
        self.data_df[column] = self.data_df[column] / float(division_info)
    elif isinstance(division_info, (pd.Series, pd.DataFrame)):
        self.data_df[column] = self.data_df[column] / division_info

# delete certain trials in the data table
def stats_desc(self, store_key, cond):
    '''
    Args
        store_key (string): define which data to be analyzed in the workspace
        cond (string): sample observation

    Returns
        descriptive statistics
    '''
    datas = list()
    for ite_file in list(self.store.keys()):
        datas.append(self.store[ite_file][store_key][str(cond)]['mean'].value)
    datas = pd.Series(datas)
    return datas.describe()

# one way ANOVA
# for scalar value usage only
def df_division(self, column, division_info):
    '''
    Args
        column (string): the column name to be played with
        division_info (string, int, float or pandas.DataFrame):
            information to be used for dividing

    Returns
        -
    '''
    if isinstance(division_info, str):
        self.data_df[column] = self.data_df[column] / self.data_df[division_info]
    elif isinstance(division_info, (int, float)):
        self.data_df[column] = self.data_df[column] / division_info
    elif isinstance(division_info, (pd.Series, pd.DataFrame)):
        self.data_df[column] = self.data_df[column] / division_info

# delete certain trials in the data table
def test_ABS():
    text = """
    ABS(X);
    """
    param1 = {
        'X': pd.Series([-2, -1, -0.5, 9.8]),
        'RESULT': pd.Series([2, 1, 0.5, 9.8])
    }
    param2 = {
        'X': pd.Series([-2, -1, 0, 9]),
        'RESULT': pd.Series([2, 1, 0, 9])
    }
    params = [param1, param2]
    testfunc(text, params)
def test_SMA():
    text = """
    SMA(X, M, N);
    """
    param1 = {
        'X': pd.Series([10.2, 30.9, 30.48, 39.34, 43.3, 45.9, 30.48, 39.34,
                        45.9, 30.48, 39.34]),
        'M': 5,
        'N': 3,
        'RESULT': pd.Series(
            [10.2, 24.985714, 28.507692, 35.177833, 40.101552, 43.594930,
             35.713058, 37.890650, 42.697520, 35.366239, 37.750596])
    }
    params = [param1]
    testfunc(text, params, True, True)
def CROSS(self, param):
    if (not isinstance(param[0], pd.core.series.Series)
            and not isinstance(param[1], pd.core.series.Series)):
        print('Invalid data type is detected.')
        return False

    if not isinstance(param[0], pd.core.series.Series):
        x1 = param[0]
        x2 = param[0]
        y1 = param[1].shift(1)
        y2 = param[1]
    if not isinstance(param[1], pd.core.series.Series):
        x1 = param[0].shift(1)
        x2 = param[0]
        y1 = param[1]
        y2 = param[1]
    if (isinstance(param[0], pd.core.series.Series)
            and isinstance(param[1], pd.core.series.Series)):
        x1 = param[0].shift(1)
        x2 = param[0]
        y1 = param[1].shift(1)
        y2 = param[1]

    return (x1 <= y1) & (x2 > y2)
def MAX(self, param):
    if isinstance(param[0], pd.core.series.Series):
        df = pd.DataFrame(index=param[0].index)
    elif isinstance(param[1], pd.core.series.Series):
        df = pd.DataFrame(index=param[1].index)
    else:
        df = None

    if df is None:
        return np.max(param)

    df['A'] = param[0]
    df['B'] = param[1]

    def callback(row):
        if row['A'] >= row['B']:
            return row['A']
        else:
            return row['B']

    result = df.apply(callback, axis=1, reduce=True)
    return result
def MIN(self, param):
    if isinstance(param[0], pd.core.series.Series):
        df = pd.DataFrame(index=param[0].index)
    elif isinstance(param[1], pd.core.series.Series):
        df = pd.DataFrame(index=param[1].index)
    else:
        df = None

    if df is None:
        return np.min(param)

    df['A'] = param[0]
    df['B'] = param[1]

    def callback(row):
        if row['A'] <= row['B']:
            return row['A']
        else:
            return row['B']

    result = df.apply(callback, axis=1, reduce=True)
    return result
def setUp(self):
    scores = pd.Series(np.ones(8), dtype=np.float32)
    np_data = np.array([
        [1, 'a'],
        [2, 'b'],
        [4, 'a'],
        [3, 'c'],
        [3, 'b'],
        [5, 'c'],
        [4, 'c'],
        [1, 'b'],
    ])
    col_labels = ['item_id', 'link_id']
    self.input_df = pd.DataFrame(data=np_data, columns=col_labels)
    self.input_df['score'] = scores
    self.sparse = SparseTransform()
    self.out = self.sparse.transform(self.input_df)
def rise_rate(df):
    date1_2 = df[record_date].map(lambda x: str2time(x)).max()
    date1_1 = datetime.datetime(date1_2.year, date1_2.month, 1).date()
    grouped1 = DataView(df).filter_by_record_date2(date1_1, date1_2)[
        [user_id, power_consumption]].groupby([user_id], as_index=False).mean()

    from dateutil.relativedelta import relativedelta
    date2_1 = date1_1 - relativedelta(months=+1)
    date2_2 = date1_2 - relativedelta(months=+1)
    grouped2 = DataView(df).filter_by_record_date2(date2_1, date2_2)[
        [user_id, power_consumption]].groupby([user_id], as_index=False).mean()

    print(date1_1, date1_2, date2_1, date2_2)
    print(grouped1)
    print(grouped2)

    user_rise_rate = pd.Series(map(lambda x, y: float(x - y) / y,
                                   grouped1[power_consumption],
                                   grouped2[power_consumption]))
    user_rise_rate.name = 'user_rise_rate'
    return grouped1[[user_id]].join(user_rise_rate)
def create_agents(self, generator):
    """
    Given information on a set of countries and a generator function,
    generate the agents and assign the results to ``self.agents``.

    :type generator: DataFrame, str, int
    :param generator: A function which generates the agents.
    """
    self.generator = generator

    country_array = pd.concat([pd.Series([c] * k["Population"])
                               for c, k in self.df.iterrows()])
    country_array.index = range(len(country_array))

    # Garbage collect before creating new processes.
    gc.collect()

    self.agents = pd.concat(
        self.pool.imap(self._gen_agents,
                       np.array_split(country_array,
                                      self.processes * self.splits))
    )
    self.agents.index = range(len(self.agents))
def minScalErr(stec, el, z, thisBias):
    """
    this determines the slope of the vTEC vs. Elevation line, which
    should be minimized in the minimum scalloping technique for
    receiver bias removal

    inputs:
        stec - time indexed Series of slant TEC values
        el - corresponding elevation values, also Series
        z - mapping function values to convert to vTEC from entire file,
            may contain nans, Series
        thisBias - the bias to be tested and minimized
    """
    intel = np.asarray(el[stec.index], int)  # bin the elevation values into int
    sTEC = np.asarray(stec, float)
    zmap = z[stec.index]
    c = np.array([(i, np.average((sTEC[intel == i] - thisBias)
                                 / zmap[intel == i]))
                  for i in np.unique(intel) if i > 30])
    return np.polyfit(c[:, 0], c[:, 1], 1)[0]
def generate_summary(df):
    level_counts = df.Level.value_counts().to_dict()
    zlist = list(zip(*[('<a href="#info">Items Processed Successfully</a>',
                        level_counts.get('INFO', 0)),
                       ('<a href="#warning">Items Skipped Due to a Warning</a>',
                        level_counts.get('WARNING', 0)),
                       ('<a href="#error">Items Skipped Due to an Error</a>',
                        level_counts.get('ERROR', 0))]))
    level_counts = pd.Series(zlist[1], index=zlist[0])
    level_counts.name = "Count"

    info_counts = df.query("Level == 'INFO'").Message.value_counts().to_dict()
    zlist = list(zip(*[('No Action', info_counts.get('SKIP', 0)),
                       ('Update', info_counts.get('UPDATE', 0)),
                       ('Create', info_counts.get('CREATE', 0))]))
    info_counts = pd.Series(zlist[1], index=zlist[0])
    info_counts.name = "Count"

    warning_counts = df.query("Level == 'WARNING'")['Msg Type'].value_counts()
    warning_counts.name = "Count"

    error_counts = df.query("Level == 'ERROR'")['Msg Type'].value_counts()
    error_counts.name = "Count"

    return level_counts, info_counts, warning_counts, error_counts
def _format_min_growth(min_growth, species):
    """Format min_growth into a pandas series.

    Arguments
    ---------
    min_growth : positive float or array-like object.
        The minimum growth rate for each individual in the community.
        Either a single value applied to all individuals or one value
        for each.
    species : array-like
        The ID for each individual model in the community.

    Returns
    -------
    pandas.Series
        A pandas Series mapping each individual to its minimum growth rate.
    """
    try:
        min_growth = float(min_growth)
    except (TypeError, ValueError):
        if len(min_growth) != len(species):
            raise ValueError(
                "min_growth must be single value or an array-like "
                "object with an entry for each species in the model.")
    return pd.Series(min_growth, species)
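Assuming the _format_min_growth helper above is in scope, a small usage sketch (the species IDs below are made up) shows how a scalar is broadcast to every member while a per-species list is kept as given:

import pandas as pd

# Hypothetical community member IDs, purely for illustration.
species = ["org_a", "org_b", "org_c"]

# A single float becomes the minimum growth rate for every member...
print(_format_min_growth(0.1, species))

# ...while an array-like with one entry per member is used directly.
print(_format_min_growth([0.05, 0.1, 0.2], species))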
def clean_data(self, df, is_with_MICE=0):
    df = df.copy()
    if df.isnull().sum().sum() > 0:
        if is_with_MICE:
            # Imputation using MICE
            numerical_features_names = self.extract_numerical_features(df)
            df.loc[:, tuple(numerical_features_names)] = self.estimate_by_mice(
                df[numerical_features_names])
        else:
            if any(tuple(df.columns == 'y')):
                df = df.dropna()
            else:
                df = df.dropna(axis=1)
                TwoSigmaFinModTools._feature_names_num = pd.Series(
                    data=np.intersect1d(
                        TwoSigmaFinModTools._feature_names_num.values,
                        df.columns),
                    dtype=object)
    TwoSigmaFinModTools._numerical_feature_names = \
        TwoSigmaFinModTools.extract_numerical_features(df)
    return df
def predict_job(job_list):
    """Assign a classification to a url"""
    # TODO: Add case where len is 1 or 0....
    job_list = [job for j in job_list for job in j]
    new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
    new_job_list = [' '.join(job) for job in new_job_list]
    vect = CountVectorizer()
    x_series = pd.Series(X)
    X_train_dtm = vect.fit_transform(x_series)
    y_train = pd.Series(y)
    job_list_series = pd.Series(new_job_list)
    job_list_dtm = vect.transform(job_list_series)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(job_list_dtm)
    # for i in range(len(job_list)):
    #     print(job_list[i], y_pred[i])
    return y_pred

# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
def count_pairs(data):
    df = pd.DataFrame(data)
    start, target = df.columns.tolist()

    # first we create groups for each pair and take the size of each group
    # as count. counts is a pandas.Series with the pairs as index
    counts = df.groupby([start, target]).size()

    # then we remove duplicate pairs from the original dataframe,
    # so length and counts are equal in size
    df = df.drop_duplicates()

    # reset index to values of pairs to fit index of counts
    df.set_index([0, 1], inplace=True, drop=False)

    # now we append the counts as a column to the original data
    df[2] = pd.Series(counts.values, index=counts.index)

    # just cast the pandas dataframe back to a numpy 2d-array usable for
    # the following steps
    array = df.values
    return array
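Assuming count_pairs above is in scope, a short usage sketch with made-up (start, target) pairs; each unique pair comes back once with its occurrence count appended as a third column:

import pandas as pd

# Hypothetical transition pairs, purely for illustration.
data = [('a', 'b'), ('a', 'b'), ('b', 'c'), ('a', 'c')]

result = count_pairs(data)
print(result)
# roughly:
# [['a' 'b' 2]
#  ['b' 'c' 1]
#  ['a' 'c' 1]]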
def _hpd_interval(self, x, width):
    """
    Code adapted from pymc3.stats.calc_min_interval:
    https://github.com/pymc-devs/pymc3/blob/master/pymc3/stats.py
    """
    x = np.sort(x)
    n = len(x)

    interval_idx_inc = int(np.floor(width * n))
    n_intervals = n - interval_idx_inc
    interval_width = x[interval_idx_inc:] - x[:n_intervals]

    if len(interval_width) == 0:
        raise ValueError('Too few elements for interval calculation')

    min_idx = np.argmin(interval_width)
    hdi_min = x[min_idx]
    hdi_max = x[min_idx + interval_idx_inc]

    index = ['hpd{}_{}'.format(width, x) for x in ['lower', 'upper']]
    return pd.Series([hdi_min, hdi_max], index=index)