cpe.net:~/workspace $ python
Python 2.7.6 (default, Oct 26 2016, 20:30:19)
[GCC 4.8.4] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> from sklearn.datasets import load_iris

>>>
>>> from sklearn.ensemble import RandomForestClassifier
>>> import pandas as pd
>>> import numpy as np
>>> np.random.seed(0)
>>> iris = load_iris()
>>> df = pd.DataFrame(iris.data, columns=iris.feature_names)
>>> df.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
>>> df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
>>> df.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
>>> df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
>>> df.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) species is_train
0 5.1 3.5 1.4 0.2 setosa True
1 4.9 3.0 1.4 0.2 setosa True
2 4.7 3.2 1.3 0.2 setosa True
3 4.6 3.1 1.5 0.2 setosa True
4 5.0 3.6 1.4 0.2 setosa True
>>> train, test = df[df['is_train']==True], df[df['is_train']==False]
>>> print('Number of observations in the training data:', len(train))
('Number of observations in the training data:', 118)
>>> print('Number of observations in the test data:', len(test))
('Number of observations in the test data:', 32)
>>> features = df.columns[:4]
>>>
>>> features
Index([u'sepal length (cm)', u'sepal width (cm)', u'petal length (cm)',
u'petal width (cm)'],
dtype='object')
>>> y = pd.factorize(train['species'])[0]
>>> y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2])
>>> clf = RandomForestClassifier(n_jobs=2, random_state=0)
>>> clf.fit(train[features], y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
oob_score=False, random_state=0, verbose=0, warm_start=False)
>>> clf.predict(test[features])
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
>>> clf.predict_proba(test[features])[0:10]
array([[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.9, 0.1, 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ]])
>>> preds = iris.target_names[clf.predict(test[features])]
>>> preds[0:5]
array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='|S10')
>>> test['species'].head()
7 setosa
8 setosa
10 setosa
13 setosa
17 setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]
>>> pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])
Predicted Species setosa versicolor virginica
Actual Species
setosa 13 0 0
versicolor 0 5 2
virginica 0 0 12
>>> list(zip(train[features], clf.feature_importances_))
[('sepal length (cm)', '0.11185992930506346'), ('sepal width (cm)', '0.016341813006098178'), ('petal length (cm)', '0.36439533040889194'), ('petal width (cm)', '0.5074029272799464')]
>>>
>>>