The following 36 code examples, extracted from open source Python projects, illustrate how to use sklearn.ensemble.IsolationForest().
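As a baseline before the project-specific examples, here is a minimal sketch of the standard fit/predict workflow, using only the public scikit-learn API on synthetic data:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X_train = 0.3 * rng.randn(100, 2)                        # normal observations
X_outliers = rng.uniform(low=-4, high=4, size=(10, 2))   # abnormal observations

clf = IsolationForest(n_estimators=100, max_samples='auto', random_state=rng)
clf.fit(X_train)

# In released scikit-learn, predict() returns +1 for inliers and -1 for
# outliers; decision_function() returns a score (the lower, the more abnormal).
print(clf.predict(X_outliers))
print(clf.decision_function(X_outliers))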
def outlier_prediction(x_train, y_train):
    # Use built-in isolation forest or use predicted vs. actual
    # Compute squared residuals of every point
    # Make a threshold criteria for inclusion
    # The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
    rng = np.random.RandomState(42)
    clf_all_features = IsolationForest(max_samples=100, random_state=rng)
    clf_all_features.fit(x_train)

    # Predict if a particular sample is an outlier using all features for higher dimensional data set.
    y_pred_train = clf_all_features.predict(x_train)

    # Exclude suggested outlier samples for improvement of prediction power/score
    # (list() keeps this working under Python 3, where map() is lazy)
    outlier_map_out_train = np.array(list(map(lambda x: x == 1, y_pred_train)))
    x_train_modified = x_train[outlier_map_out_train, ]
    y_train_modified = y_train[outlier_map_out_train, ]

    return x_train_modified, y_train_modified
def updateWindow(window, buf, maxContainSize):
    if len(buf) >= maxContainSize:  # buffer is full
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :", len(window)
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=60)
        ilf.fit(window)
        print "isolation update finished"
    else:  # buffer not yet full; just append it to the window
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=60)
        ilf.fit(window)
        print "isolation update finished"
    return window, ilf
def init(idlist, d, dblack, outcome, winsize=200, sleeptime=5):
    # fill the initial window
    window = []
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d, dblack)
        outvalue = extract(d, idlist)
        window.append(outvalue)
        if len(window) > winsize:
            break
        sleep(sleeptime)
    # train the isolation forest
    ilf = IsolationForest(n_estimators=60)
    ilf.fit(window)
    print ilf.predict(window)
    for i in ilf.predict(window):
        outcome.append(i)
    # return the model and the window
    return ilf, window
def updateWindow(window, buf, maxContainSize):
    if len(buf) >= maxContainSize:  # buffer is full
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :", len(window)
        for i in buf:
            window = window.append(i)
        ilf = IsolationForest(n_estimators=100, verbose=2)
        ilf.fit(window)
        print "isolation update finished"
    else:  # buffer not yet full; just append it to the window
        print "higher than threads"
        for i in buf:
            window = window.append(i)
        ilf = IsolationForest(n_estimators=100, verbose=2)
        ilf.fit(window)
        print "isolation update finished"
    return window, ilf
def updateWindow(window, buf, maxContainSize):
    if len(buf) >= maxContainSize:  # buffer is full
        print window
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :", len(window)
        for i in buf:
            window.append(i)
            # print i
        ilf = IsolationForest(n_estimators=100)
        ilf.fit(window)
        print "isolation update finished"
    else:  # buffer not yet full; just append it to the window
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100)
        ilf.fit(window)
        print "isolation update finished"
    return window, ilf
def updateWindow(window, buf, maxContainSize):
    if len(buf) >= maxContainSize:  # buffer is full
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :", len(window)
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100, verbose=2)
        ilf.fit(window)
        print "isolation update finished"
    else:  # buffer not yet full; just append it to the window
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100, verbose=2)
        ilf.fit(window)
        print "isolation update finished"
    return window, ilf
def updateWindow(window, buf, maxContainSize):
    if len(buf) >= maxContainSize:  # buffer is full
        print "buffer full "
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100, contamination=0.01)
        ilf.fit(window)
        print "isolation update finished"
    else:  # buffer not yet full; just append it to the window
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100, contamination=0.01)
        ilf.fit(window)
        print "isolation update finished"
    return window, ilf
def init(idlist, d, dblack, outcome, winsize=200, sleeptime=5):
    # fill the initial window
    window = []
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d, dblack)
        outvalue = extract(d, idlist)
        window.append(outvalue)
        if len(window) > winsize:
            break
        sleep(sleeptime)
    # train the isolation forest
    ilf = IsolationForest(n_estimators=100, contamination=0.01)
    ilf.fit(window)
    print ilf.predict(window)
    for i in ilf.predict(window):
        outcome.append(i)
    # return the model and the window
    return ilf, window
def updateWindow(l_sys, l_namenode, l_FS, l_RPC, cont):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    # query the most recent samples from InfluxDB
    query = 'select * from ganglia where w_fs >0 and w_namenode>0 and w_rpc >0 limit 1024;'
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    result = client.query(query, chunked=False)
    data = result['ganglia']
    d_sys = data[l_sys]
    d_namenode = data[l_namenode]
    d_FS = data[l_FS]
    d_RPC = data[l_RPC]
    # one isolation forest per metric group
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)
    print "update finished"
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
def transform(self, X, **transform_params):
    if X.shape[0] < 1 / self.contamination:
        return X
    self.isolation_forest = IsolationForest(contamination=self.contamination,
                                            n_estimators=self.n_estimators,
                                            n_jobs=self.n_jobs)
    to_analyze = self._columns_to_apply(X)
    if to_analyze is None:
        to_analyze = self._numeric_columns(X)
    rest = self._rest_columns(X, to_analyze)
    self.isolation_forest.fit(to_analyze)
    labels = self.isolation_forest.predict(to_analyze)
    # Keep only the rows labelled as inliers (+1) in both column groups.
    to_analyze['_outlier'] = labels
    to_analyze = to_analyze[to_analyze['_outlier'] == 1]
    del to_analyze['_outlier']
    rest['_outlier'] = labels
    rest = rest[rest['_outlier'] == 1]
    del rest['_outlier']
    if self.verbose:
        print('%s Now has %s' % (self.class_name, to_analyze.shape[0]))
    return pd.concat((to_analyze, rest), axis=1)
def test_iforest_error():
    """Test that it gives proper exception on deficient input."""
    X = iris.data

    # Test max_samples
    assert_raises(ValueError, IsolationForest(max_samples=-1).fit, X)
    assert_raises(ValueError, IsolationForest(max_samples=0.0).fit, X)
    assert_raises(ValueError, IsolationForest(max_samples=2.0).fit, X)
    # The dataset has less than 256 samples, explicitly setting
    # max_samples > n_samples should result in a warning. If not set
    # explicitly there should be no warning.
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         IsolationForest(max_samples=1000).fit, X)
    assert_no_warnings(IsolationForest(max_samples='auto').fit, X)
    assert_raises(ValueError, IsolationForest(max_samples='foobar').fit, X)
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = np.r_[X + 2, X - 2]
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = clf.predict(X_test)

    # check that there is at most 6 errors (false positive or false negative)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
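The test above appears to come from a development snapshot of scikit-learn in which predict() returned raw anomaly scores. Against released versions, where predict() only returns +1/-1 labels, the same performance check can be expressed with decision_function(), roughly as in this sketch:

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
from sklearn.utils import check_random_state

rng = check_random_state(2)
X = 0.3 * rng.randn(120, 2)
X_train = X[:100]
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
X_test = np.r_[X[100:], X_outliers]
y_test = np.array([0] * 20 + [1] * 20)

clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
# decision_function() is low for abnormal points; negate it so the score
# increases with abnormality, matching y_test where 1 marks an outlier.
scores = -clf.decision_function(X_test)
print(roc_auc_score(y_test, scores))  # typically close to 1.0 on this toy data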
def __init__(self, _id, _config):
    super(IsolationForest, self).__init__(_id, _config)
    self._nb_samples = int(_config['nb_samples'])
def get_default_config():
    return {
        'module': IsolationForest.__name__,
        'nb_samples': N_SAMPLES
    }
def _get_best_detector(self, train):
    detector = ensemble.IsolationForest()
    detector.fit(train)
    return detector
def setUp(self):
    super(TestIsolationForest, self).setUp()
    self.if_sml = isolation_forest.IsolationForest(
        "fakeid", {"module": "fake", "nb_samples": 1000})
def test_learn_structure(self):
    data = self.get_testing_data()
    clf = self.if_sml.learn_structure(data)
    self.assertIsInstance(clf, ensemble.IsolationForest)
def exportPresentationData(classifier, action):
    dir = input('Give Data Directory: ')
    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def exportPresentationData(classifier, action, dir):
    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def init(idlist, d, dblack, winsize=50):
    data = getdata()
    loadvalue(data, d, dblack)
    outvalue = extract(d, idlist)
    print len(outvalue)
    reshapevalue = np.array(outvalue).reshape(1, -1)
    window = DataFrame(reshapevalue)
    buf = []
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d, dblack)
        outvalue = extract(d, idlist)
        reshapevalue = np.array(outvalue).reshape(1, -1)
        window = window.append(DataFrame(reshapevalue))  # append a DataFrame of shape 1 row x n columns
        buf.append(DataFrame(reshapevalue))
        print len(window)
        if len(window) > winsize:
            break
        sleep(5)
    ilf = IsolationForest(n_estimators=100, verbose=2)
    ilf.fit(window)
    print ilf.predict(window)
    print "__________________"
    for i in buf:
        print ilf.predict(i)
    return ilf, window
def updateWindow(buf, cont):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    ilf.fit(buf)  # retrain on the contents of buf
    print "isolation update finished"
    return ilf
def init(l_sys, l_namenode, l_FS, l_RPC, d, dwhite, winsize=200, sleeptime=15, cont=0.01):
    # collect the initial windows
    win_sys = []
    win_namenode = []
    win_FS = []
    win_RPC = []
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d, dwhite)
        o_sys, o_namenode, o_FS, o_RPC = extract(d, l_sys, l_namenode, l_FS, l_RPC)
        # append the extracted values to each window
        win_sys.append(o_sys)
        win_namenode.append(o_namenode)
        win_FS.append(o_FS)
        win_RPC.append(o_RPC)
        if len(win_sys) > winsize:  # stop once the window is full
            break
        sleep(sleeptime)
    # one isolation forest per metric group
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    # fit each forest
    ilf_sys.fit(win_sys)
    ilf_namenode.fit(win_namenode)
    ilf_FS.fit(win_FS)
    ilf_RPC.fit(win_RPC)
    print ilf_sys.predict(win_sys)
    print ilf_namenode.predict(win_namenode)
    print ilf_FS.predict(win_FS)
    print ilf_RPC.predict(win_RPC)
    # return the trained models
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
def updateWindow(buf, cont):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    ilf.fit(buf)  # retrain on the contents of buf
    print "isolation update finished"
    return ilf
def init(l_sys, l_namenode, l_FS, l_RPC, l_queue, d, dwhite, winsize=200, sleeptime=15, cont=0.01):
    # collect the initial windows
    win_sys = []
    win_namenode = []
    win_FS = []
    win_RPC = []
    win_queue = []
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d, dwhite)
        o_sys, o_namenode, o_FS, o_RPC, o_queue = extract(d, l_sys, l_namenode, l_FS, l_RPC, l_queue)
        # append the extracted values to each window
        win_sys.append(o_sys)
        win_namenode.append(o_namenode)
        win_FS.append(o_FS)
        win_RPC.append(o_RPC)
        win_queue.append(o_queue)
        if len(win_sys) > winsize:  # stop once the window is full
            break
        sleep(sleeptime)
    # one isolation forest per metric group
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    ilf_queue = IsolationForest(n_estimators=100, contamination=cont)
    # fit each forest
    ilf_sys.fit(win_sys)
    ilf_namenode.fit(win_namenode)
    ilf_FS.fit(win_FS)
    ilf_RPC.fit(win_RPC)
    ilf_queue.fit(win_queue)
    # return the trained models
    return ilf_sys, ilf_namenode, ilf_FS, ilf_queue, ilf_RPC
def updateWindow(l_sys, l_namenode, l_FS, l_RPC, cont, limit):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    # resample the recent history with decay, keeping at most `limit` rows
    data_sys = sampleWithDecay(client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC limit 1500')
    d_sys = data_sys[l_sys]
    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC limit 1500')
    d_FS = data_fs[l_FS]
    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC limit 1500')
    d_namenode = data_namenode[l_namenode]
    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC limit 1500')
    d_RPC = data_rpc[l_RPC]
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    # refit each forest on the resampled data
    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)
    print "update finished"
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
def updateWindow(l_sys, l_namenode, l_FS, l_RPC, cont, limit):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    # resample the recent history with decay, keeping at most `limit` rows
    data_sys = sampleWithDecay(client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC')
    d_sys = data_sys[l_sys]
    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC')
    d_FS = data_fs[l_FS]
    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC')
    d_namenode = data_namenode[l_namenode]
    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC')
    d_RPC = data_rpc[l_RPC]
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    # refit each forest on the resampled data
    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)
    print "update finished"
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
def init(l_sys, l_namenode, l_FS, l_RPC, sleeptime=15, cont=0.01, limit=300):
    # one isolation forest per metric group
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=50, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    # load historical data from InfluxDB
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    data_sys = sampleWithDecay(client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC')
    d_sys = data_sys[l_sys]
    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC')
    d_FS = data_fs[l_FS]
    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC')
    d_namenode = data_namenode[l_namenode]
    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC')
    d_RPC = data_rpc[l_RPC]
    print len(d_sys)
    print len(d_FS)
    print len(d_namenode)
    print len(d_RPC)
    # fit each forest
    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)
    print ilf_FS.predict(d_FS)
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
def test_iforest():
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({"n_estimators": [3],
                          "max_samples": [0.5, 1.0, 3],
                          "bootstrap": [True, False]})

    with ignore_warnings():
        for params in grid:
            IsolationForest(random_state=rng,
                            **params).fit(X_train).predict(X_test)
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
def test_recalculate_max_depth():
    """Check that max_depth is recalculated when max_samples is reset to n_samples"""
    X = iris.data
    clf = IsolationForest().fit(X)
    for est in clf.estimators_:
        assert_equal(est.max_depth, int(np.ceil(np.log2(X.shape[0]))))
def test_max_samples_attribute():
    X = iris.data
    clf = IsolationForest().fit(X)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = IsolationForest(max_samples=500)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         clf.fit, X)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = IsolationForest(max_samples=0.4).fit(X)
    assert_equal(clf.max_samples_, 0.4 * X.shape[0])
def test_iforest_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert_greater(np.min(pred[-2:]), np.max(pred[:-2]))
def isolationForest(self, settings, mname, data):
    '''
    :param settings: -> settings dictionary
    :param mname: -> name of serialized cluster
    :return: -> isolation forest instance
    :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                           max_features:1.0, n_jobs:1, random_state:None, verbose:0}
    '''
    # rng = np.random.RandomState(42)
    if settings['random_state'] == 'None':
        settings['random_state'] = None

    if isinstance(settings['bootstrap'], str):
        settings['bootstrap'] = str2Bool(settings['bootstrap'])

    if isinstance(settings['verbose'], str):
        settings['verbose'] = str2Bool(settings['verbose'])

    if settings['max_samples'] != 'auto':
        settings['max_samples'] = int(settings['max_samples'])
    # print type(settings['max_samples'])
    for k, v in settings.iteritems():
        logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
        print "IsolationForest %s set to %s" % (k, v)
    try:
        clf = IsolationForest(n_estimators=int(settings['n_estimators']),
                              max_samples=settings['max_samples'],
                              contamination=float(settings['contamination']),
                              bootstrap=settings['bootstrap'],
                              max_features=float(settings['max_features']),
                              n_jobs=int(settings['n_jobs']),
                              random_state=settings['random_state'],
                              verbose=settings['verbose'])
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot instantiate isolation forest with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     type(inst), inst.args)
        print "Error while instantiating isolation forest with %s and %s" % (type(inst), inst.args)
        sys.exit(1)
    # clf = IsolationForest(max_samples=100, random_state=rng)
    # print "*&*&*&& %s" % type(data)
    try:
        clf.fit(data)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     type(inst), inst.args)
        sys.exit(1)
    predict = clf.predict(data)
    print "Anomaly Array:"
    print predict
    self.__serializemodel(clf, 'isoforest', mname)
    return clf
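For reference, a call with the docstring's example settings could look like the following sketch; `engine` stands in for an instance of the (unshown) enclosing class and `df` for a pandas DataFrame of training data, both hypothetical names:

# Hypothetical usage of the method above. `engine` is an instance of the
# enclosing class and `df` a pandas DataFrame; both names are placeholders.
settings = {'n_estimators': 100, 'max_samples': 100, 'contamination': 0.1,
            'bootstrap': False, 'max_features': 1.0, 'n_jobs': 1,
            'random_state': 'None', 'verbose': 0}
clf = engine.isolationForest(settings, 'isoforest_example', df)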
def detect(self, method, model, data):
    '''
    :param method: -> method name
    :param model: -> trained clusterer
    :param data: -> dataframe with data
    :return: -> dictionary that contains the list of anomalous timestamps
    '''
    smodel = self.__loadClusterModel(method, model)
    anomalieslist = []
    if not smodel:
        dpredict = 0
    else:
        if data.shape[0]:
            if isinstance(smodel, IsolationForest):
                print "Detected IsolationForest model"
                print "Contamination -> %s" % smodel.contamination
                print "Max_Features -> %s" % smodel.max_features
                print "Max_Samples -> %s" % smodel.max_samples_
                print "Threshold -> %s " % smodel.threshold_
                try:
                    dpredict = smodel.predict(data)
                    print "IsolationForest Prediction Array -> %s" % str(dpredict)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
            elif isinstance(smodel, DBSCAN):
                print "Detected DBSCAN model"
                print "Leaf_size -> %s" % smodel.leaf_size
                print "Algorithm -> %s" % smodel.algorithm
                print "EPS -> %s" % smodel.eps
                print "Min_Samples -> %s" % smodel.min_samples
                print "N_jobs -> %s" % smodel.n_jobs
                try:
                    dpredict = smodel.fit_predict(data)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting sDBSCAN model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
        else:
            dpredict = 0
            logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                           str(data.shape[0]), str(data.shape[1]))
            print "Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]), str(data.shape[1]))
    print "dpredict type is %s" % (type(dpredict))
    if type(dpredict) is not int:
        # anomalies are the points predicted as -1
        anomalyarray = np.argwhere(dpredict == -1)
        for an in anomalyarray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]]['key'])
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]]['key']))
            anomalieslist.append(anomalies)
    anomaliesDict = {}
    anomaliesDict['anomalies'] = anomalieslist
    logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                model, method, str(anomaliesDict))
    return anomaliesDict
def online_detect():
    df = pd.read_csv('ganglia.csv')
    # initialize the sliding window and the detector
    maxContainSize = 500
    window = df[1000:]
    ilf = IsolationForest(n_estimators=100, verbose=2)
    ilf.fit(window)
    print ilf.predict(window)
    analomyNum = 0
    allanalomy = 0
    outcome = []
    label = []
    k = 3  # number of recent predictions considered by warn()
    d = {}
    buf = []
    idlist, namelist = loadname()
    savename(namelist, idlist)
    print "initial finished"
    counter = 1
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d)
        outvalue = extract(d, idlist)
        reshapevalue = np.array(outvalue).reshape(1, -1)
        predictValue = ilf.predict(reshapevalue)
        print "predict:", predictValue
        a = int(predictValue)
        outcome.append(a)
        label.append(a)
        buf.append(DataFrame(reshapevalue))  # a DataFrame of shape 1 row x n columns
        if a == -1:
            analomyNum += 1
            allanalomy += 1
            # raise a warning if the recent predictions look anomalous
            if warn(buf, label, k):
                label[-1] = 1
                # analyse which metrics triggered the warning
                analyseWarn(buf, outcome, k, namelist)
                updateWindow(window, buf, maxContainSize)
        if detectUpdate(buf, 0.87, maxContainSize, analomyNum):  # 0.087
            del ilf
            window, ilf = updateWindow(window, buf, maxContainSize)
            analomyNum = 0
            del buf
            buf = []
        counter += 1
        if counter % 5000 == 0:
            break
        sleep(15)
def init(l_sys, l_namenode, l_FS, l_RPC, d, dwhite, winsize=200, sleeptime=15, cont=0.01, limit=300):
    win_sys = []
    win_namenode = []
    win_FS = []
    win_RPC = []
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d, dwhite)
        o_sys, o_namenode, o_FS, o_RPC = extract(d, l_sys, l_namenode, l_FS, l_RPC)
        # append the extracted values to each window
        win_sys.append(o_sys)
        win_namenode.append(o_namenode)
        win_FS.append(o_FS)
        win_RPC.append(o_RPC)
        if len(win_sys) > winsize:  # stop once the window is full
            break
        sleep(sleeptime)
    # one isolation forest per metric group
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    # load historical data from InfluxDB
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    data_sys = sampleWithDecay(client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC limit 1500')
    d_sys = data_sys[l_sys]
    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC limit 1500')
    d_FS = data_fs[l_FS]
    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC limit 1500')
    d_namenode = data_namenode[l_namenode]
    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC limit 1500')
    d_RPC = data_rpc[l_RPC]
    # merge the historical data with the freshly collected windows
    append_sys = pd.DataFrame(win_sys, columns=l_sys)
    append_namenode = pd.DataFrame(win_namenode, columns=l_namenode)
    append_FS = pd.DataFrame(win_FS, columns=l_FS)
    append_RPC = pd.DataFrame(win_RPC, columns=l_RPC)
    out_sys = pd.concat([d_sys, append_sys])
    out_namenode = pd.concat([d_namenode, append_namenode])
    out_FS = pd.concat([d_FS, append_FS])
    out_RPC = pd.concat([d_RPC, append_RPC])
    # fit each forest
    ilf_sys.fit(out_sys)
    ilf_namenode.fit(out_namenode)
    ilf_FS.fit(out_FS)
    ilf_RPC.fit(out_RPC)
    print ilf_sys.predict(win_sys)
    print ilf_namenode.predict(win_namenode)
    print ilf_FS.predict(win_FS)
    print ilf_RPC.predict(win_RPC)
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC