Tutorials#

Tutorial 1: Predicting Forest Cover Type using PEARL#

Dataset Information:

  • Forest cover type is natural resource inventory information that is often economically or legally impossible to collect directly, yet it is vital to any private, state, or federal land management agency.

  • Collected from four wilderness areas located in the Roosevelt National Forest of northern Colorado.

  • Contains variables such as wilderness areas and soil type.

  • Available in the recurrent-data repo; a quick way to inspect the ARFF file is sketched after this list.
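
The snippet below is an optional, minimal sketch for inspecting the ARFF file before running the tutorial. It assumes SciPy and pandas are installed and that /your/path/recurrent-data/real-world/covtype.arff points at your local copy of the recurrent-data repo; adjust the path to your setup.

from scipy.io import arff
import pandas as pd

# Load the ARFF file into a pandas DataFrame for a quick look.
data, meta = arff.loadarff("/your/path/recurrent-data/real-world/covtype.arff")
df = pd.DataFrame(data)

print(meta.names()[:10])   # first few attribute names
print(df.shape)            # number of instances and attributes
print(df.iloc[0])          # one example instance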

Code

%matplotlib notebook
from matplotlib import pyplot as plt
from matplotlib import animation
from skika.ensemble import adaptive_random_forest, pearl


class evaluate(object):
    def __init__(self, classifier):
        self.accuracy = 0
        self.num_instances = 0
        self.classifier = classifier

        self.classifier.init_data_source("/your/path/recurrent-data/real-world/covtype.arff")

    def __call__(self):

        correct = 0

        sample_freq = 1000

        for count in range(0, sample_freq):
            if not self.classifier.get_next_instance():
                break

            # test
            prediction = self.classifier.predict()

            actual_label = self.classifier.get_cur_instance_label()
            if prediction == actual_label:
                correct += 1

            # train
            self.classifier.train()

            self.classifier.delete_cur_instance()

        self.accuracy = correct / sample_freq
        self.num_instances += sample_freq

        return self.num_instances, self.accuracy

num_trees = 60
max_num_candidate_trees = 120
repo_size = 9000
edit_distance_threshold = 90
kappa_window = 50
lossy_window_size = 100000000
reuse_window_size = 0
max_features = -1
bg_kappa_threshold = 0
cd_kappa_threshold = 0.4
reuse_rate_upper_bound = 0.18
warning_delta = 0.0001
drift_delta = 0.00001
enable_state_adaption = True
enable_state_graph = True

arf_classifier = adaptive_random_forest(num_trees,
                                        max_features,
                                        warning_delta,
                                        drift_delta)
arf = evaluate(arf_classifier)

pearl_classifier = pearl(num_trees,
                         max_num_candidate_trees,
                         repo_size,
                         edit_distance_threshold,
                         kappa_window,
                         lossy_window_size,
                         reuse_window_size,
                         max_features,
                         bg_kappa_threshold,
                         cd_kappa_threshold,
                         reuse_rate_upper_bound,
                         warning_delta,
                         drift_delta,
                         enable_state_adaption,
                         enable_state_graph)
pearl = evaluate(pearl_classifier)

fig = plt.figure()

x_arf = []
y_arf = []
x_pearl = []
y_pearl = []

max_samples = 580000

def frames_arf():
    for i in range(max_samples):
        yield arf()

def animate_arf(args):
    x_arf.append(args[0])
    y_arf.append(args[1])
    return plt.plot(x_arf, y_arf, color='C0', linestyle='-', label='ARF')

def frames_pearl():
    for i in range(max_samples):
        yield pearl()

def animate_pearl(args):
    x_pearl.append(args[0])
    y_pearl.append(args[1])
    return plt.plot(x_pearl, y_pearl, color='C1', linestyle='--', label='PEARL')

anim_arf = animation.FuncAnimation(fig, animate_arf, frames=frames_arf, interval=1000)
anim_pearl = animation.FuncAnimation(fig, animate_pearl, frames=frames_pearl, interval=1000)

plt.xlabel('no. instances')
plt.ylabel('accuracy')

F = plt.gcf()
Size = F.get_size_inches()
F.set_size_inches(Size[0]*1.5, Size[1]*1, forward=True)
plt.show()

Result:

[Figure: accuracy vs. number of instances for ARF and PEARL on the covtype stream] (_images/covtype-results.svg)

Tutorial 2: Error rate comparison between AutoDDM and DDM on synthetic data stream#

Dataset Information:

  • AGRAWAL data stream generator; a minimal look at the raw generator is sketched after this list.
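
For context, the AGRAWAL generator produces instances describing loan applicants (salary, commission, age, loan amount, and so on) and labels them with one of ten predefined classification functions; switching functions simulates concept drift. The snippet below is a minimal sketch of the raw generator using scikit-multiflow, separate from the RecurringConceptStream wrapper used in the tutorial code.

from skmultiflow.data import AGRAWALGenerator

# Two generators with different classification functions act as two concepts.
gen_a = AGRAWALGenerator(classification_function=0, random_state=42)
gen_b = AGRAWALGenerator(classification_function=2, random_state=42)

X, y = gen_a.next_sample(5)   # five instances from the first concept
print(X.shape, y)             # (5, 9) feature matrix and binary labels

X, y = gen_b.next_sample(5)   # same features, labels from a different concept
print(y)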

Code

from skika.hyper_parameter_tuning.drift_detectors import AutoDDM
from skmultiflow.drift_detection import DDM
from skika.data.reccurring_concept_stream import RCStreamType, RecurringConceptStream, conceptOccurence
import matplotlib.pyplot as plt
import warnings
import numpy as np
import random
import collections
from skmultiflow.trees import HoeffdingTreeClassifier

warnings.filterwarnings('ignore')
plt.style.use("seaborn-whitegrid")

# Global variables
TRAINING_SIZE = 1
STREAM_SIZE = 6000000
grace = 1000
tolerance = 500
DRIFT_INTERVALS = [50000]
concepts = [0, 1, 2]
RANDOMNESS = 100

seed = random.randint(0, 10000)
keys = []
actuals = [0]
concept_chain = {0: 0}
current_concept = 0
# Build a concept chain with drift points roughly every DRIFT_INTERVALS instances.
# Each drift point is jittered by up to RANDOMNESS instances, and a second drift is
# injected 17000 instances later, so that concepts recur throughout the stream.
for i in range(1, STREAM_SIZE + 1):
    for j in DRIFT_INTERVALS:
        if i % j == 0:
            if i not in keys:
                keys.append(i)
                randomness = random.randint(0, RANDOMNESS)
                d = i + (randomness if random.randint(0, 1) > 0 else -randomness)
                concept_index = random.randint(0, len(concepts) - 1)
                while concepts[concept_index] == current_concept:
                    concept_index = random.randint(0, len(concepts) - 1)
                concept = concepts[concept_index]
                concept_chain[d] = concept
                actuals.append(d)
                current_concept = concept

                i2 = i + 17000
                keys.append(i2)
                randomness = random.randint(0, RANDOMNESS)
                d = i2 + (randomness if random.randint(0, 1) > 0 else -randomness)
                concept_index = random.randint(0, len(concepts) - 1)
                while concepts[concept_index] == current_concept:
                    concept_index = random.randint(0, len(concepts) - 1)
                concept = concepts[concept_index]
                concept_chain[d] = concept
                actuals.append(d)
                current_concept = concept

x = collections.Counter(concept_chain.values())
print(x)

concept_0 = conceptOccurence(id=0, difficulty=6, noise=0,
                             appearences=x[0], examples_per_appearence=max(DRIFT_INTERVALS))
concept_1 = conceptOccurence(id=1, difficulty=6, noise=0,
                             appearences=x[1], examples_per_appearence=max(DRIFT_INTERVALS))
concept_2 = conceptOccurence(id=2, difficulty=6, noise=0,
                             appearences=x[2], examples_per_appearence=max(DRIFT_INTERVALS))
desc = {0: concept_0, 1: concept_1, 2: concept_2}

datastream = RecurringConceptStream(
    rctype=RCStreamType.AGRAWAL,
    num_samples=STREAM_SIZE,
    noise=0,
    concept_chain=concept_chain,
    seed=seed,
    desc=desc,
    boost_first_occurance=False)

X_train = []
y_train = []
for i in range(0, TRAINING_SIZE):
    X, y = datastream.next_sample()
    X_train.append(X[0])
    y_train.append(y[0])

X_train = np.array(X_train)
y_train = np.array(y_train)

ht_auto = HoeffdingTreeClassifier()
ht_auto.partial_fit(X_train, y_train)

ht_ddm = HoeffdingTreeClassifier()
ht_ddm.partial_fit(X_train, y_train)

n_global = TRAINING_SIZE  # Cumulative Number of observations
d_auto = 0
d_ddm = 0
TP_auto = []
TP_ddm = []
FP_auto = []
FP_ddm = []
Delay_auto = []
Delay_ddm = []
grace_end_auto = n_global
grace_end_ddm = n_global
accuracy_auto = 0
accuracy_ddm = 0
acc_x = []
acc_y_auto = []
acc_y_ddm = []

ddm = DDM()
autoDDM = AutoDDM(tolerance=tolerance)
while datastream.has_more_samples():
    n_global += 1
    X_test, y_test = datastream.next_sample()

    # Record the cumulative error rate (1 - accuracy) for both detectors every 1000 instances
    if n_global % 1000 == 0:
        acc_x.append(n_global)
        acc_y_auto.append(1 - (accuracy_auto / n_global))
        acc_y_ddm.append(1 - (accuracy_ddm / n_global))

    y_predict_ddm = ht_ddm.predict(X_test)
    ddm.add_element(y_test != y_predict_ddm)
    accuracy_ddm += 1 if y_test == y_predict_ddm else 0
    if (n_global > grace_end_ddm):
        if ddm.detected_change():
            d_ddm += 1
            drift_point_ddm = min(actuals, key=lambda x: abs(x - n_global))
            if (drift_point_ddm != 0 and drift_point_ddm not in TP_ddm and abs(drift_point_ddm - n_global) <= tolerance):
                Delay_ddm.append(abs(n_global - drift_point_ddm))
                TP_ddm.append(drift_point_ddm)
                ht_ddm = HoeffdingTreeClassifier()
                grace_end_ddm = n_global + grace
            else:
                FP_ddm.append(drift_point_ddm)
    ht_ddm.partial_fit(X_test, y_test)

    y_predict_auto = ht_auto.predict(X_test)
    autoDDM.add_element(y_test != y_predict_auto, n_global)
    accuracy_auto += 1 if y_test == y_predict_auto else 0
    if (n_global > grace_end_auto):
        if autoDDM.detected_change():
            d_auto += 1
            drift_point_auto = min(actuals, key=lambda x: abs(x - n_global))
            if (drift_point_auto != 0 and drift_point_auto not in TP_auto and abs(drift_point_auto - n_global) <= tolerance):
                Delay_auto.append(abs(n_global - drift_point_auto))
                TP_auto.append(drift_point_auto)
                ht_auto = HoeffdingTreeClassifier()
                autoDDM.detect_TP(n_global)
                grace_end_auto = n_global + grace
            else:
                autoDDM.detect_FP(n_global)
                FP_auto.append(drift_point_auto)
    ht_auto.partial_fit(X_test, y_test)

print("Actual drifts:" + str(len(actuals)))

print("Number of drifts detected by AutoDDM: " + str(d_auto))
print("TP by AutoDDM:" + str(len(TP_auto)))
print("FP by AutoDDM:" + str(len(FP_auto)))
print("Mean Delay by AutoDDM:" + str(np.mean(Delay_auto)))
print("Accuracy by AutoDDM:" + str(accuracy_auto / STREAM_SIZE))

print("Number of drifts detected by DDM: " + str(d_ddm))
print("TP by DDM:" + str(len(TP_ddm)))
print("FP by DDM:" + str(len(FP_ddm)))
print("Mean Delay by DDM:" + str(np.mean(Delay_ddm)))
print("Accuracy by DDM:" + str(accuracy_ddm / STREAM_SIZE))

plt.plot(acc_x, acc_y_ddm, color='black', label='ddm')
plt.plot(acc_x, acc_y_auto, color='red', label='autoDDM')
plt.xlabel("Time")
plt.ylabel("Error Rate")
plt.legend()
plt.show()

Result:

[Figure: error rate over time for DDM and AutoDDM] (_images/ErrorRate.PNG)

Tutorial 3: Predicting Forest Cover Type using NACRE#

Dataset Information:

  • Same as Tutorial 1

Additional Instructions

  • NACRE relies on a separate sequence prediction gRPC service; it is expected to be reachable on the port configured as grpc_port in the code below.

Code

import time

import numpy as np
from sklearn.metrics import cohen_kappa_score
from skika.nacre import nacre_wrapper

# classification settings
np.random.seed(0)
stream_file_path = "./recurrent-data/real-world/covtype.arff"
sample_freq = 1000

# pearl specific params
num_trees = 60
max_num_candidate_trees = 120
repo_size = 9000
edit_distance_threshold = 90
kappa_window = 50
lossy_window_size = 100000000
reuse_window_size = 0
max_features = -1
bg_kappa_threshold = 0
cd_kappa_threshold = 0.4
reuse_rate_upper_bound = 0.18
warning_delta = 0.0001
drift_delta = 0.00001

poisson_lambda = 6
random_state = 0

# nacre specific params
grpc_port = 50051 # port number for the sequence prediction grpc service
pro_drift_window = 100 # number of instances must be seen for proactive drift adaptation
seq_len = 8 # sequence length for sequence predictor
backtrack_window = 25 # number of instances per eval when backtracking
stability_delta = 0.001
hybrid_delta = 0.001

classifier = nacre_wrapper.nacre_wrapper(
                 seed=0,
                 stream_file_path=stream_file_path,
                 num_trees=num_trees,
                 max_num_candidate_trees=max_num_candidate_trees,
                 repo_size=repo_size,
                 edit_distance_threshold=edit_distance_threshold,
                 kappa_window=kappa_window,
                 lossy_window_size=lossy_window_size,
                 reuse_window_size=reuse_window_size,
                 max_features=max_features,
                 bg_kappa_threshold=bg_kappa_threshold,
                 cd_kappa_threshold=cd_kappa_threshold,
                 reuse_rate_upper_bound=reuse_rate_upper_bound,
                 warning_delta=warning_delta,
                 drift_delta=drift_delta,
                 poisson_lambda=poisson_lambda,
                 random_state=random_state,
                 pro_drift_window=pro_drift_window,
                 backtrack_window=backtrack_window,
                 stability_delta=stability_delta,
                 hybrid_delta=hybrid_delta,
                 seq_len=seq_len)


count = 0
correct = 0
window_actual_labels = []
window_predicted_labels = []

start_time = time.process_time()

while classifier.get_next_instance():
    count += 1

    # test
    prediction = classifier.predict()

    actual_label = classifier.get_cur_instance_label()
    if prediction == actual_label:
        correct += 1

    window_actual_labels.append(actual_label)
    window_predicted_labels.append(prediction)

    classifier.train()

    # log performance
    if count % sample_freq == 0 and count != 0:
        accuracy = correct / sample_freq
        elapsed_time = time.process_time() - start_time + classifier.cpt_runtime
        kappa = cohen_kappa_score(window_actual_labels, window_predicted_labels)
        candidate_tree_size = classifier.get_candidate_tree_group_size()
        tree_pool_size = classifier.get_tree_pool_size()

        print(f"{count},{accuracy},{kappa},{candidate_tree_size},{tree_pool_size},{str(elapsed_time)}")

        correct = 0
        window_actual_labels = []
        window_predicted_labels = []

Tutorial 4: Transfer Learning with AOTrAdaBoost#

Dataset Information:

  • The bike datasets are real-world datasets available at the transfer-data repo.

  • The task is to classify whether rental bikes are in low or high demand at different times in different cities. The weekday data from Washington D.C. serves as the source and the weekend data from London as the target; both the source and target streams contain concept drifts. (A purely illustrative sketch of how such a low/high label could be derived is given after this list.)
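
The snippet below is a purely illustrative sketch of how a low/high demand label could be derived from hourly rental counts with a median split; it is not necessarily how the labeled ARFF files in the transfer-data repo were produced, and the counts used here are made up.

import pandas as pd

# Hypothetical hourly rental counts for one city.
counts = pd.Series([12, 45, 230, 310, 87, 150, 20, 400])

# Median split: above-median hours are labeled high demand, the rest low.
threshold = counts.median()
labels = (counts > threshold).map({True: "high", False: "low"})
print(labels.tolist())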

Code

from skika.aotradaboost import trans_tree_wrapper

sample_freq = 100
num_trees = 60
rf_lambda = 1
random_state = 0

warning_delta = 0.0001
drift_delta = 0.00001
kappa_window = 60

least_transfer_warning_period_instances_length = 300
instance_store_size = 8000
num_diff_distr_instances = 200
bbt_pool_size = 40
eviction_interval = 1000000
transfer_kappa_threshold = 0.1
transfer_gamma = 8
transfer_match_lowerbound = 0.0
boost_mode = "atradaboost" # i.e. aotradaboost
disable_drift_detection = False


data_file_path = "/your/path/transfer-data/bike/dc-weekend-source.arff;/your/path/transfer-data/bike/weekday.arff";

classifier = trans_tree_wrapper(
    len(data_file_path.split(";")),
    random_state,
    kappa_window,
    warning_delta,
    drift_delta,
    least_transfer_warning_period_instances_length,
    instance_store_size,
    num_diff_distr_instances,
    bbt_pool_size,
    eviction_interval,
    transfer_kappa_threshold,
    transfer_gamma,
    transfer_match_lowerbound,
    boost_mode,
    num_trees,
    disable_drift_detection)

data_file_list = data_file_path.split(";")


class ClassifierMetrics:
    def __init__(self):
        self.correct = 0
        self.instance_idx = 0

def prequential_evaluation_transfer(
        classifier,
        data_file_paths,
        sample_freq):

    classifier_metrics_list = []
    for i in range(len(data_file_paths)):
        classifier.init_data_source(i, data_file_paths[i])
        classifier_metrics_list.append(ClassifierMetrics())

    classifier_idx = 0
    classifier.switch_classifier(classifier_idx)
    metric = classifier_metrics_list[classifier_idx]

    while True:
        if not classifier.get_next_instance():
            # Switch streams to simulate parallel streams

            classifier_idx += 1
            if classifier_idx >= len(data_file_paths):
                break

            classifier.switch_classifier(classifier_idx)
            metric = classifier_metrics_list[classifier_idx]

            print()
            print(f"switching to classifier_idx {classifier_idx}")
            continue

        classifier_metrics_list[classifier_idx].instance_idx += 1

        # test
        prediction = classifier.predict()

        actual_label = classifier.get_cur_instance_label()
        if prediction == actual_label:
            metric.correct += 1

        # train
        classifier.train()

        log_metrics(
            classifier_metrics_list[classifier_idx].instance_idx,
            sample_freq,
            metric,
            classifier)

def log_metrics(count, sample_freq, metric, classifier):
    if count % sample_freq == 0 and count != 0:
        accuracy = round(metric.correct / sample_freq, 2)
        print(f"{accuracy}")

        metric.correct = 0


prequential_evaluation_transfer(
    classifier=classifier,
    data_file_paths=data_file_list,
    sample_freq=sample_freq)

Tutorial 5: Transfer Learning with OPERA#

Dataset Information:

  • The original MNIST and Fashion MNIST datasets are used as the source streams, and each target stream is generated by inverting all pixel values of v% of the classes, so v% indicates the dissimilarity between the two domains; we denote these configurations as MNIST v% and Fashion MNIST v%. In addition, a 2 by 2 filter is applied to both the MNIST and Fashion MNIST images so that the random forest base learners, which use a limited number of trees, achieve reasonable performance. (The target-stream transformation is sketched after this list.)

  • Available at the transfer-data repo.
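
The snippet below is a minimal sketch of the target-stream transformation described above, assuming 28x28 grayscale images with pixel values in [0, 255] and 2 by 2 average pooling as the filter; the exact filter used for the files in the transfer-data repo is not specified here, and the repo already provides the transformed ARFF files, so this is for illustration only.

import numpy as np

def make_target(images, labels, v_percent, rng):
    """Invert all pixel values for v% of the classes."""
    classes = np.unique(labels)
    n_flip = int(len(classes) * v_percent / 100)
    flipped = rng.choice(classes, size=n_flip, replace=False)
    out = images.copy()
    mask = np.isin(labels, flipped)
    out[mask] = 255 - out[mask]
    return out, flipped

def pool_2x2(images):
    """Apply a 2 by 2 average-pooling filter, reducing 28x28 images to 14x14."""
    n, h, w = images.shape
    return images.reshape(n, h // 2, 2, w // 2, 2).mean(axis=(2, 4))

rng = np.random.default_rng(0)
images = rng.integers(0, 256, size=(10, 28, 28)).astype(float)  # stand-in data
labels = rng.integers(0, 10, size=10)

target_images, flipped = make_target(images, labels, v_percent=20, rng=rng)
pooled = pool_2x2(target_images)
print(flipped, pooled.shape)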

Code

from skika.transfer import opera_wrapper

sample_freq = 1000
num_trees = 100
rf_lambda = 1
random_state = 0

num_phantom_branches = 30
squashing_delta = 7
obs_period = 1000
conv_delta = 0.1
conv_threshold = 0.15
obs_window_size = 50
perf_window_size = 5000
min_obs_period = 2000
split_range = 10
force_disable_patching = False
force_enable_patching = False
grow_transfer_surrogate_during_obs = False


# datasets separated by semicolons: source stream first, then target stream
data_file_path = "./transfer-data/fashion-mnist/flip20/source.arff;./transfer-data/fashion-mnist/flip20/target.arff"
data_file_paths = data_file_path.split(";")

classifier = opera_wrapper(
    len(data_file_paths),
    random_state,
    num_trees,
    rf_lambda,
    num_phantom_branches,
    squashing_delta,
    obs_period,
    conv_delta,
    conv_threshold,
    obs_window_size,
    perf_window_size,
    min_obs_period,
    split_range,
    grow_transfer_surrogate_during_obs,
    force_disable_patching,
    force_enable_patching)


class ClassifierMetrics:
    def __init__(self):
        self.correct = 0
        self.instance_idx = 0

def log_metrics(count, sample_freq, metric, classifier):
    if count % sample_freq == 0 and count != 0:
        accuracy = round(metric.correct / sample_freq, 2)

        # Phantom tree outputs
        f = int(classifier.get_full_region_complexity())
        e = int(classifier.get_error_region_complexity())
        c = int(classifier.get_correct_region_complexity())

        print(f"{count},{accuracy},{f},{e},{c}")
        metric.correct = 0


classifier_metrics_list = []
for i in range(len(data_file_paths)):
    classifier.init_data_source(i, data_file_paths[i])
    classifier_metrics_list.append(ClassifierMetrics())

classifier_idx = 0
classifier.switch_classifier(classifier_idx)
metric = classifier_metrics_list[classifier_idx]

while True:
    if not classifier.get_next_instance():
        # Switch streams to simulate parallel streams

        classifier_idx += 1
        if classifier_idx >= len(data_file_paths):
            break

        classifier.switch_classifier(classifier_idx)
        metric = classifier_metrics_list[classifier_idx]

        print()
        print(f"switching to classifier_idx {classifier_idx}")
        continue

    classifier_metrics_list[classifier_idx].instance_idx += 1

    # test
    prediction = classifier.predict()

    actual_label = classifier.get_cur_instance_label()
    if prediction == actual_label:
        metric.correct += 1

    # train
    classifier.train()

    log_metrics(
        classifier_metrics_list[classifier_idx].instance_idx,
        sample_freq,
        metric,
        classifier)