Here’s a small kmeans that uses any of the 20-odd distances in
scipy.spatial.distance, or a user function.
Comments would be welcome (this has had only one user so far, not enough);
in particular, what are your N, dim, k, metric ?
#!/usr/bin/env python
# kmeans.py using any of the 20-odd metrics in scipy.spatial.distance
# kmeanssample 2 pass, first sample sqrt(N)
from __future__ import division
import random
import numpy as np
from scipy.spatial.distance import cdist # $scipy/spatial/distance.py
# http://docs.scipy.org/doc/scipy/reference/spatial.html
from scipy.sparse import issparse # $scipy/sparse/csr.py
__date__ = "2011-11-17 Nov denis"
# X sparse, any cdist metric: real app ?
# centres get dense rapidly, metrics in high dim hit distance whiteout
# vs unsupervised / semi-supervised svm
#...............................................................................
def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ):
""" centres, Xtocentre, distances = kmeans( X, initial centres ... )
in:
X N x dim may be sparse
centres k x dim: initial centres, e.g. random.sample( X, k )
delta: relative error, iterate until the average distance to centres
is within delta of the previous average distance
maxiter
metric: any of the 20-odd in scipy.spatial.distance
"chebyshev" = max, "cityblock" = L1, "minkowski" with p=
or a function( Xvec, centrevec ), e.g. Lqmetric below
p: for minkowski metric -- local mod cdist for 0 < p < 1 too
verbose: 0 silent, 2 prints running distances
out:
centres, k x dim
Xtocentre: each X -> its nearest centre, ints N -> k
distances, N
see also: kmeanssample below, class Kmeans below.
"""
if not issparse(X):
X = np.asanyarray(X) # ?
centres = centres.todense() if issparse(centres) \
else centres.copy()
N, dim = X.shape
k, cdim = centres.shape
if dim != cdim:
raise ValueError( "kmeans: X %s and centres %s must have the same number of columns" % (
X.shape, centres.shape ))
if verbose:
print "kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s" % (
X.shape, centres.shape, delta, maxiter, metric)
allx = np.arange(N)
prevdist = 0
for jiter in range( 1, maxiter+1 ):
D = cdist_sparse( X, centres, metric=metric, p=p ) # |X| x |centres|
xtoc = D.argmin(axis=1) # X -> nearest centre
distances = D[allx,xtoc]
avdist = distances.mean() # median ?
if verbose >= 2:
print "kmeans: av |X - nearest centre| = %.4g" % avdist
if (1 - delta) * prevdist <= avdist <= prevdist \
or jiter == maxiter:
break
prevdist = avdist
for jc in range(k): # (1 pass in C)
c = np.where( xtoc == jc )[0]
if len(c) > 0:
centres[jc] = X[c].mean( axis=0 )
if verbose:
print "kmeans: %d iterations cluster sizes:" % jiter, np.bincount(xtoc)
if verbose >= 2:
r50 = np.zeros(k)
r90 = np.zeros(k)
for j in range(k):
dist = distances[ xtoc == j ]
if len(dist) > 0:
r50[j], r90[j] = np.percentile( dist, (50, 90) )
print "kmeans: cluster 50 % radius", r50.astype(int)
print "kmeans: cluster 90 % radius", r90.astype(int)
# scale L1 / dim, L2 / sqrt(dim) ?
return centres, xtoc, distances
#...............................................................................
def kmeanssample( X, k, nsample=0, **kwargs ):
""" 2-pass kmeans, fast for large N:
1) kmeans a random sample of nsample ~ sqrt(N) from X
2) full kmeans, starting from those centres
"""
# merge w kmeans ? mttiw
# v large N: sample N^1/2, N^1/2 of that
# seed like sklearn ?
N, dim = X.shape
if nsample == 0:
nsample = max( 2*np.sqrt(N), 10*k )
Xsample = randomsample( X, int(nsample) )
pass1centres = randomsample( X, int(k) )
samplecentres = kmeans( Xsample, pass1centres, **kwargs )[0]
return kmeans( X, samplecentres, **kwargs )
def cdist_sparse( X, Y, **kwargs ):
""" -> |X| x |Y| cdist array, any cdist metric
X or Y may be sparse -- best csr
"""
# todense row at a time, v slow if both v sparse
sxy = 2*issparse(X) + issparse(Y)
if sxy == 0:
return cdist( X, Y, **kwargs )
d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
if sxy == 2:
for j, x in enumerate(X):
d[j] = cdist( x.todense(), Y, **kwargs ) [0]
elif sxy == 1:
for k, y in enumerate(Y):
d[:,k] = cdist( X, y.todense(), **kwargs ) [0]
else:
for j, x in enumerate(X):
for k, y in enumerate(Y):
d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0]
return d
def randomsample( X, n ):
""" random.sample of the rows of X
X may be sparse -- best csr
"""
sampleix = random.sample( xrange( X.shape[0] ), int(n) )
return X[sampleix]
def nearestcentres( X, centres, metric="euclidean", p=2 ):
""" each X -> nearest centre, any metric
euclidean2 (~ withinss) is more sensitive to outliers,
cityblock (manhattan, L1) less sensitive
"""
D = cdist( X, centres, metric=metric, p=p ) # |X| x |centres|
return D.argmin(axis=1)
def Lqmetric( x, y=None, q=.5 ):
# yes a metric, may increase weight of near matches; see ...
return (np.abs(x - y) ** q) .mean() if y is not None \
else (np.abs(x) ** q) .mean()
#...............................................................................
class Kmeans:
""" km = Kmeans( X, k= or centres=, ... )
in: either initial centres= for kmeans
or k= [nsample=] for kmeanssample
out: km.centres, km.Xtocentre, km.distances
iterator:
for jcentre, J in km:
clustercentre = centres[jcentre]
J indexes e.g. X[J], classes[J]
"""
def __init__( self, X, k=0, centres=None, nsample=0, **kwargs ):
self.X = X
if centres is None:
self.centres, self.Xtocentre, self.distances = kmeanssample(
X, k=k, nsample=nsample, **kwargs )
else:
self.centres, self.Xtocentre, self.distances = kmeans(
X, centres, **kwargs )
def __iter__(self):
for jc in range(len(self.centres)):
yield jc, (self.Xtocentre == jc)
#...............................................................................
if __name__ == "__main__":
import random
import sys
from time import time
N = 10000
dim = 10
ncluster = 10
kmsample = 100 # 0: random centres, > 0: kmeanssample
kmdelta = .001
kmiter = 10
metric = "cityblock" # "chebyshev" = max, "cityblock" L1, Lqmetric
seed = 1
exec( "\n".join( sys.argv[1:] )) # run this.py N= ...
np.set_printoptions( 1, threshold=200, edgeitems=5, suppress=True )
np.random.seed(seed)
random.seed(seed)
print "N %d dim %d ncluster %d kmsample %d metric %s" % (
N, dim, ncluster, kmsample, metric)
X = np.random.exponential( size=(N,dim) )
# cf scikits-learn datasets/
t0 = time()
if kmsample > 0:
centres, xtoc, dist = kmeanssample( X, ncluster, nsample=kmsample,
delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
else:
randomcentres = randomsample( X, ncluster )
centres, xtoc, dist = kmeans( X, randomcentres,
delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
print "%.0f msec" % ((time() - t0) * 1000)
# also ~/py/np/kmeans/test-kmeans.py
Some notes added 26 Mar 2012:
1) for cosine distance, first normalize all the data vectors to |X| = 1; then
cosinedistance( X, Y ) = 1 - X . Y = Euclidean distance |X - Y|^2 / 2
is fast. For bit vectors, keep the norms separately from the vectors
instead of expanding out to floats
(although some programs may expand for you).
For sparse vectors, say 1 % of N, X . Y should take time O( 2 % N ),
space O(N); but I don’t know which programs do that.
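A one-line sketch of that normalization for a dense array (each row of X becomes a unit vector, so the identity above applies row-wise):
import numpy as np
X = X / np.linalg.norm(X, axis=1, keepdims=True)  # now |each row| = 1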
2) Scikit-learn clustering gives an excellent overview of k-means, mini-batch-k-means … with code that works on scipy.sparse matrices.
3) Always check cluster sizes after k-means.
If you’re expecting roughly equal-sized clusters, but they come out
[44 37 9 5 5] % … (sound of head-scratching).
Unfortunately no: scikit-learn's current implementation of k-means only uses Euclidean distances.
It is not trivial to extend k-means to other distances, and denis' answer above is not the correct way to implement k-means for other metrics.
Answer 2
Just use nltk instead, where you can do this, e.g.
from nltk.cluster.kmeans import KMeansClusterer
import nltk.cluster.util  # needed for the cosine_distance reference below
NUM_CLUSTERS = <choose a value>
data = <sparse matrix that you would normally give to scikit>.toarray()
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)
Yes, you can use a different metric function; however, by definition, the k-means clustering algorithm relies on the Euclidean distance from the mean of each cluster.
You could use a different metric, so even though you are still calculating the mean you could use something like the Mahalanobis distance.
Sklearn KMeans uses the Euclidean distance. It has no metric parameter. That said, if you're clustering time series, you can use the tslearn Python package, where you can specify a metric (dtw, softdtw, euclidean).
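A minimal sketch of that, per tslearn's documented TimeSeriesKMeans API (the parameter values here are illustrative):
from tslearn.clustering import TimeSeriesKMeans
km = TimeSeriesKMeans(n_clusters=3, metric="dtw")
labels = km.fit_predict(X)  # X: time series array of shape (n_series, series_length, n_features)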
I have a machine learning classification problem with 80% categorical variables. Must I use one-hot encoding if I want to use some classifier for the classification? Can I pass the data to a classifier without the encoding?
I am trying to do the following for feature selection:
I change the type of the categorical features to ‘category’:
non_categorial_features = ['orig_destination_distance',
'srch_adults_cnt',
'srch_children_cnt',
'srch_rm_cnt',
'cnt']
for categorical_feature in list(train_small.columns):
if categorical_feature not in non_categorial_features:
train_small[categorical_feature] = train_small[categorical_feature].astype('category')
The problem is that the third part often gets stuck, although I am using a strong machine.
Thus, without the one-hot encoding I can't do any feature selection to determine the importance of the features.
What do you recommend?
Answer 0
Approach 1: You can use get_dummies on a pandas dataframe.
Example 1:
import pandas as pd
s = pd.Series(list('abca'))
pd.get_dummies(s)
Out[]:
a b c
0 1.0 0.0 0.0
1 0.0 1.0 0.0
2 0.0 0.0 1.0
3 1.0 0.0 0.0
Example 2:
The following will transform a given column into one-hot columns. Use the prefix argument to get multiple dummies.
import pandas as pd
df = pd.DataFrame({
'A':['a','b','a'],
'B':['b','a','c']
})
df
Out[]:
A B
0 a b
1 b a
2 a c
# Get one hot encoding of columns B
one_hot = pd.get_dummies(df['B'])
# Drop column B as it is now encoded
df = df.drop('B',axis = 1)
# Join the encoded df
df = df.join(one_hot)
df
Out[]:
A a b c
0 a 0 1 0
1 b 1 0 0
2 a 0 0 1
Approach 2: Use Scikit-learn
Using a OneHotEncoder has the advantage of being able to fit on some training data and then transform on some other data using the same instance. We also have handle_unknown to further control what the encoder does with unseen data.
Given a dataset with three features and four samples, we let the encoder find the maximum value per feature and transform the data to a binary one-hot encoding.
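The dataset for this example did not survive the copy; here is a minimal sketch in the same spirit (the values are made up):
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')  # unseen categories encode to all zeros
X_train = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]  # 4 samples, 3 features
enc.fit(X_train)  # learn the categories of each feature
print(enc.transform([[0, 1, 3]]).toarray())
# [[1. 0. 0. 1. 0. 0. 0. 0. 1.]]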
It's much easier to use Pandas for basic one-hot encoding. If you're looking for more options you can use scikit-learn.
For basic one-hot encoding with Pandas you pass your data frame into the get_dummies function.
For example, if I have a dataframe called imdb_movies:
…and I want to one-hot encode the Rated column, I do this:
pd.get_dummies(imdb_movies.Rated)
This returns a new dataframe with a column for every “level” of rating that exists, along with either a 1 or 0 specifying the presence of that rating for a given observation.
Usually, we want this to be part of the original dataframe. In this case, we attach our new dummy-coded frame onto the original frame using "column-binding".
We can column-bind by using Pandas concat function:
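A sketch of that concat call (imdb_movies and its Rated column come from the example above):
import pandas as pd
rated_dummies = pd.get_dummies(imdb_movies.Rated)
imdb_movies = pd.concat([imdb_movies, rated_dummies], axis=1)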
You can do it with numpy.eye, using the array element selection mechanism:
import numpy as np
nb_classes = 6
data = [[2, 3, 4, 0]]
def indices_to_one_hot(data, nb_classes):
"""Convert an iterable of indices to one-hot encoded labels."""
targets = np.array(data).reshape(-1)
return np.eye(nb_classes)[targets]
The return value of indices_to_one_hot(data, nb_classes) is now
array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.]])
Lastly, is it necessary for you to one hot encode? One hot encoding exponentially increases the number of features, drastically increasing the run time of any classifier or anything else you are going to run. Especially when each categorical feature has many levels. Instead you can do dummy coding.
Using dummy encoding usually works well, for much less run time and complexity. A wise prof once told me, ‘Less is More’.
Here’s the code for my custom encoding function if you want.
from sklearn.preprocessing import LabelEncoder
#Auto encodes any dataframe column of type category or object.
def dummyEncode(df):
columnsToEncode = list(df.select_dtypes(include=['category','object']))
le = LabelEncoder()
for feature in columnsToEncode:
try:
df[feature] = le.fit_transform(df[feature])
except:
print('Error encoding '+feature)
return df
EDIT: A comparison, to be clearer:
One-hot encoding: convert n levels to n-1 columns.
Index Animal Index cat mouse
1 dog 1 0 0
2 cat --> 2 1 0
3 mouse 3 0 1
You can see how this will explode your memory if you have many different types (or levels) in your categorical feature. Keep in mind, this is just ONE column.
Dummy Coding:
Index Animal Index Animal
1 dog 1 0
2 cat --> 2 1
3 mouse 3 2
Convert to numerical representations instead. Greatly saves feature space, at the cost of a bit of accuracy.
Answer 4
One-hot encoding with pandas is very easy:
import pandas as pd

def one_hot(df, cols):
"""
@param df pandas DataFrame
@param cols a list of columns to encode
@return a DataFrame with one-hot encoding
"""
for each in cols:
dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
df = pd.concat([df, dummies], axis=1)
return df
EDIT:
Another way to one_hot, using sklearn's LabelBinarizer:
from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
label_binarizer.fit(all_your_labels_list) # need to be global or remembered to use it later
def one_hot_encode(x):
"""
One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
: x: List of sample Labels
: return: Numpy array of one-hot encoded labels
"""
return label_binarizer.transform(x)
Answer 5
You can use the numpy.eye function.
import numpy as np
def one_hot_encode(x, n_classes):
"""
One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
: x: List of sample Labels
: return: Numpy array of one-hot encoded labels
"""
return np.eye(n_classes)[x]
def main():
list = [0,1,2,3,4,3,2,1,0]
n_classes = 5
one_hot_list = one_hot_encode(list, n_classes)
print(one_hot_list)
if __name__ == "__main__":
main()
One-hot encoding requires a bit more than converting the values to indicator variables. Typically the ML process requires you to apply this coding several times to validation or test data sets, and to apply the model you construct to real-time observed data. You should store the mapping (transform) that was used to construct the model. A good solution would use DictVectorizer or LabelEncoder (followed by get_dummies). Here is a function that you can use:
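The body of that function did not survive this copy; here is a minimal sketch consistent with the description and the calls below, built on LabelEncoder plus get_dummies (the original may differ):
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def oneHotEncode2(df, le_dict=None):
    # Fit fresh LabelEncoders on training data; reuse the stored ones on test data.
    train = le_dict is None
    if train:
        le_dict = {}
        columns = list(df.select_dtypes(include=['category', 'object']))
    else:
        columns = list(le_dict.keys())
    for feature in columns:
        if train:
            le_dict[feature] = LabelEncoder().fit(df[feature])
        df[feature] = le_dict[feature].transform(df[feature])
        dummies = pd.get_dummies(df[feature], prefix=feature)
        df = pd.concat([df.drop(feature, axis=1), dummies], axis=1)
    return df, le_dict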
This works on a pandas dataframe: for each column of the dataframe it creates a mapping, and it returns the mappings back. So you would call it like this:
train_data, le_dict = oneHotEncode2(train_data)
Then on the test data, the call is made by passing the dictionary returned back from training:
test_data, _ = oneHotEncode2(test_data, le_dict)
An equivalent method is to use DictVectorizer. A related post is on my blog. I mention it here since it provides some reasoning behind this approach over simply using get_dummies (disclosure: this is my own blog).
You can pass the data to the CatBoost classifier without encoding. CatBoost handles categorical variables itself, using one-hot and expanding mean target encoding.
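A minimal sketch (the parameter values and column indices here are illustrative):
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=100, verbose=False)
model.fit(X_train, y_train, cat_features=[0, 2])  # indices of the categorical columns
preds = model.predict(X_test)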
Answer 10
You can also do the following. Note that below, you do not have to use pd.concat.
import pandas as pd
# initialise data of lists.
data = {'Color': ['Red', 'Yellow', 'Red', 'Yellow'],
        'Length': [20.1, 21.1, 19.1, 18.1],
        'Group': [1, 2, 1, 2]}
# Create DataFrame
df = pd.DataFrame(data)
for _c in df.select_dtypes(include=['object']).columns:
    print(_c)
    df[_c] = pd.Categorical(df[_c])
df_transformed = pd.get_dummies(df)
df_transformed
You can also change explicit columns to categorical. For example, here I change Color and Group:
import pandas as pd
# initialise data of lists.
data = {'Color': ['Red', 'Yellow', 'Red', 'Yellow'],
        'Length': [20.1, 21.1, 19.1, 18.1],
        'Group': [1, 2, 1, 2]}
# Create DataFrame
df = pd.DataFrame(data)
columns_to_change = list(df.select_dtypes(include=['object']).columns)
columns_to_change.append('Group')
for _c in columns_to_change:
    print(_c)
    df[_c] = pd.Categorical(df[_c])
df_transformed = pd.get_dummies(df)
df_transformed
class OneHotEncoder:
def __init__(self,optionKeys):
length=len(optionKeys)
self.__dict__={optionKeys[j]:[0 if i!=j else 1 for i in range(length)] for j in range(length)}
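A quick usage sketch (each option key becomes an attribute holding its one-hot vector):
enc = OneHotEncoder(['red', 'green', 'blue'])
print(enc.red)    # [1, 0, 0]
print(enc.green)  # [0, 1, 0]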
def one_hot_encode(y):
"""Convert an iterable of indices to one-hot encoded labels."""
y = y.flatten() # Sometimes not flattened vector is passed e.g (118,1) in these cases
# the function ends up creating a tensor e.g. (118, 2, 1). flatten removes this issue
nb_classes = len(np.unique(y)) # get the number of unique classes
standardised_labels = dict(zip(np.unique(y), np.arange(nb_classes))) # get the class labels as a dictionary
# which then is standardised. E.g imagine class labels are (4,7,9) if a vector of y containing 4,7 and 9 is
# directly passed then np.eye(nb_classes)[4] or 7,9 throws an out of index error.
# standardised labels fixes this issue by returning a dictionary;
# standardised_labels = {4:0, 7:1, 9:2}. The values of the dictionary are mapped to keys in y array.
# standardised_labels also removes the error that is raised if the labels are floats. E.g. 1.0; element
# cannot be called by an integer index e.g y[1.0] - throws an index error.
targets = np.vectorize(standardised_labels.get)(y) # map the dictionary values to array.
return np.eye(nb_classes)[targets]
Here is a function to do one-hot-encoding without using numpy, pandas, or other packages. It takes a list of integers, booleans, or strings (and perhaps other types too).
import typing
def one_hot_encode(items: list) -> typing.List[list]:
results = []
    # find the unique items (we want unique items b/c duplicate items will have the same encoding)
unique_items = list(set(items))
# sort the unique items
sorted_items = sorted(unique_items)
# find how long the list of each item should be
max_index = len(unique_items)
for item in items:
# create a list of zeros the appropriate length
one_hot_encoded_result = [0 for i in range(0, max_index)]
# find the index of the item
one_hot_index = sorted_items.index(item)
# change the zero at the index from the previous line to a one
one_hot_encoded_result[one_hot_index] = 1
# add the result
results.append(one_hot_encoded_result)
return results
I know there are already a lot of answers to this question, but I noticed two things. First, most of the answers use packages like numpy and/or pandas. And this is a good thing. If you are writing production code, you should probably be using robust, fast algorithms like those provided in the numpy/pandas packages. But, for the sake of education, I think someone should provide an answer which has a transparent algorithm and not just an implementation of someone else’s algorithm. Second, I noticed that many of the answers do not provide a robust implementation of one-hot encoding because they do not meet one of the requirements below. Below are some of the requirements (as I see them) for a useful, accurate, and robust one-hot encoding function:
A one-hot encoding function must:
handle a list of various types (e.g. integers, strings, floats, etc.) as input
handle an input list with duplicates
return a list of lists corresponding (in the same order as) to the inputs
return a list of lists where each list is as short as possible
I tested many of the answers to this question and most of them fail on one of the requirements above.
Answer 18
Try this:
!pip install category_encoders
import category_encoders as ce
categorical_columns = [...the list of names of the columns you want to one-hot-encode ...]
encoder = ce.OneHotEncoder(cols=categorical_columns, use_cat_names=True)
df_train_encoded = encoder.fit_transform(df_train_small)
df_train_encoded.head()
The resulting dataframe df_train_encoded is the same as the original, but the categorical features are now replaced with their one-hot-encoded versions.
Transformer in scikit-learn – some class that has fit and transform methods, or a fit_transform method.
Predictor – some class that has fit and predict methods, or a fit_predict method.
Pipeline is just an abstract notion, not some existing ML algorithm. Often in ML tasks you need to perform a sequence of different transformations (find a set of features, generate new features, select only some good features) on the raw dataset before applying the final estimator.
Here is a good example of Pipeline usage.
Pipeline gives you a single interface for all 3 steps of transformation and resulting estimator. It encapsulates transformers and predictors inside, and now you can do something like:
vect = CountVectorizer()
tfidf = TfidfTransformer()
clf = SGDClassifier()
vX = vect.fit_transform(Xtrain)
tfidfX = tfidf.fit_transform(vX)
predicted = clf.fit(tfidfX, ytrain).predict(tfidfX)  # SGDClassifier is supervised, so it needs ytrain
# Now evaluate all steps on test set -- transform only, no refitting
vX = vect.transform(Xtest)
tfidfX = tfidf.transform(vX)
predicted = clf.predict(tfidfX)
With just:
pipeline = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier()),
])
predicted = pipeline.fit(Xtrain, ytrain).predict(Xtrain)
# Now evaluate all steps on test set
predicted = pipeline.predict(Xtest)
With pipelines you can easily perform a grid search over the set of parameters for each step of this meta-estimator, as described in the link above. All steps except the last must be transformers; the last step can be a transformer or a predictor.
Answer to edit:
When you call pipln.fit(), each transformer inside the pipeline is fitted on the outputs of the previous transformer (the first transformer is fitted on the raw dataset). The last estimator may be a transformer or a predictor. You can call fit_transform() on the pipeline only if the last estimator is a transformer (one that implements fit_transform, or transform and fit separately); you can call fit_predict() or predict() on the pipeline only if the last estimator is a predictor. So you just can't call fit_transform or transform on a pipeline whose last step is a predictor.
I think that M0rkHaV has the right idea. Scikit-learn’s pipeline class is a useful tool for encapsulating multiple different transformers alongside an estimator into one object, so that you only have to call your important methods once (fit(), predict(), etc). Let’s break down the two major components:
Transformers are classes that implement both fit() and transform(). You might be familiar with some of the sklearn preprocessing tools, like TfidfVectorizer and Binarizer. If you look at the docs for these preprocessing tools, you’ll see that they implement both of these methods. What I find pretty cool is that some estimators can also be used as transformation steps, e.g. LinearSVC!
Estimators are classes that implement both fit() and predict(). You’ll find that many of the classifiers and regression models implement both these methods, and as such you can readily test many different models. It is possible to use another transformer as the final estimator (i.e., it doesn’t necessarily implement predict(), but definitely implements fit()). All this means is that you wouldn’t be able to call predict().
As for your edit: let’s go through a text-based example. Using LabelBinarizer, we want to turn a list of labels into a list of binary values.
from sklearn.preprocessing import LabelBinarizer
bin = LabelBinarizer() #first we initialize
vec = ['cat', 'dog', 'dog', 'dog'] #we have our label list we want binarized
Now, when the binarizer is fitted on some data, it will have a structure called classes_ that contains the unique classes that the transformer ‘knows’ about. Without calling fit() the binarizer has no idea what the data looks like, so calling transform() wouldn’t make any sense. You can see this by printing out the list of classes before trying to fit the data.
print bin.classes_
I get the following error when trying this:
AttributeError: 'LabelBinarizer' object has no attribute 'classes_'
But when you fit the binarizer on the vec list:
bin.fit(vec)
and try again
print bin.classes_
I get the following:
['cat' 'dog']
print bin.transform(vec)
And now, after calling transform on the vec object, we get the following:
[[0]
[1]
[1]
[1]]
As for estimators being used as transformers, let us use the DecisionTree classifier as an example of a feature-extractor. Decision Trees are great for a lot of reasons, but for our purposes, what’s important is that they have the ability to rank features that the tree found useful for predicting. When you call transform() on a Decision Tree, it will take your input data and find what it thinks are the most important features. So you can think of it transforming your data matrix (n rows by m columns) into a smaller matrix (n rows by k columns), where the k columns are the k most important features that the Decision Tree found.
ML algorithms typically process tabular data. You may want to do preprocessing and post-processing of this data before and after your ML algorithm. A pipeline is a way to chain those data processing steps.
A pipeline is a series of steps in which data is transformed. It comes from the old “pipe and filter” design pattern (for instance, you could think of unix bash commands with pipes “|” or redirect operators “>”). However, pipelines are objects in the code. Thus, you may have a class for each filter (a.k.a. each pipeline step), and then another class to combine those steps into the final pipeline. Some pipelines may combine other pipelines in series or in parallel, have multiple inputs or outputs, and so on. We like to view Machine Learning pipelines as:
Pipe and filters. The pipeline’s steps process data, and they manage their inner state which can be learned from the data.
Composites. Pipelines can be nested: for example a whole pipeline can be treated as a single pipeline step in another pipeline. A pipeline step is not necessarily a pipeline, but a pipeline is itself at least a pipeline step by definition.
Directed Acyclic Graphs (DAG). A pipeline step’s output may be sent to many other steps, and then the resulting outputs can be recombined, and so on. Side note: although pipelines are acyclic, they can process multiple items one by one, and if their state changes (e.g.: using the fit_transform method each time), then they can be viewed as recurrently unfolding through time, keeping their states (think like an RNN). That’s an interesting way to see pipelines for doing online learning when putting them in production and training them on more data.
Methods of a Scikit-Learn Pipeline
Pipelines (or steps in the pipeline) must have those two methods:
“fit” to learn on the data and acquire state (e.g.: neural network’s neural weights are such state)
“transform” (or “predict”) to actually process the data and generate a prediction.
It’s also possible to call this method to chain both:
“fit_transform” to fit and then transform the data, but in one pass, which allows for potential code optimizations when the two methods must be done one after the other directly.
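A minimal sketch of those three methods on a Scikit-Learn pipeline (the steps and data here are made up for illustration):
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

X = np.random.rand(10, 5)                        # toy data: 10 samples x 5 features
pipe = Pipeline([('scale', StandardScaler()),    # fit() learns the means / stds
                 ('pca', PCA(n_components=2))])  # fit() learns the components
Xt = pipe.fit_transform(X)                       # fit each step, then transform, in one pass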
Scikit-Learn’s “pipe and filter” design pattern is simply beautiful. But how to use it for Deep Learning, AutoML, and complex production-level pipelines?
Scikit-Learn had its first release in 2007, in the pre-deep-learning era. However, it’s one of the most well-known and widely adopted machine learning libraries, and it is still growing. On top of that, it uses the Pipe and Filter design pattern as a software architectural style: it’s what makes Scikit-Learn so fabulous, added to the fact that it provides algorithms ready for use. However, it has massive issues when it comes to doing the following, which we should be able to do in 2020 already:
Automatic Machine Learning (AutoML),
Deep Learning Pipelines,
More complex Machine Learning pipelines.
Solutions that we’ve Found to Those Scikit-Learn’s Problems
For sure, Scikit-Learn is very convenient and well-built. However, it needs a refresh. Here are our solutions with Neuraxle to make Scikit-Learn fresh and usable within modern computing projects!
Additional pipeline methods and features offered through Neuraxle
Note: if a step of a pipeline doesn’t need to have one of the fit or transform methods, it could inherit from NonFittableMixin or NonTransformableMixin to be provided a default implementation of one of those methods to do nothing.
As a starter, it is possible for pipelines or their steps to also optionally define those methods:
“setup” which will call the “setup” method on each of its step. For instance, if a step contains a TensorFlow, PyTorch, or Keras neural network, the steps could create their neural graphs and register them to the GPU in the “setup” method before fit. It is discouraged to create the graphs directly in the constructors of the steps for several reasons, such as if the steps are copied before running many times with different hyperparameters within an Automatic Machine Learning algorithm that searches for the best hyperparameters for you.
“teardown”, which is the opposite of the “setup” method: it clears resources.
The following methods are provided by default to allow for managing hyperparameters:
“get_hyperparams” will return a dictionary of the hyperparameters. If your pipeline contains more pipelines (nested pipelines), then the hyperparameter keys are chained with double-underscore “__” separators.
“set_hyperparams” will allow you to set new hyperparameters in the same format of when you get them.
“get_hyperparams_space” allows you to get the hyperparameter space, which will not be empty if you defined one. So, the only difference from “get_hyperparams” here is that you’ll get statistical distributions as values instead of precise values. For instance, one hyperparameter for the number of layers could be RandInt(1, 3), which means 1 to 3 layers. You can call .rvs() on this dict to pick a value randomly and send it to “set_hyperparams” to try training on it.
“set_hyperparams_space” can be used to set a new space using the same hyperparameter distribution classes as in “get_hyperparams_space”.
For more info on our suggested solutions, read the entries in the big list with links above.
I have Keras installed with the TensorFlow backend and CUDA. I’d like to sometimes force Keras to use the CPU, on demand. Can this be done without, say, installing a separate CPU-only TensorFlow in a virtual environment? If so, how? If the backend were Theano, the flags could be set, but I have not heard of TensorFlow flags accessible via Keras.
Answer 0
If you want to force Keras to use the CPU:
Way 1
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import tensorflow as tf
from keras import backend as K
num_cores = 4
GPU = False  # these two booleans were assumed pre-set in the original; pick the device here
CPU = True
if GPU:
num_GPU = 1
num_CPU = 1
if CPU:
num_CPU = 1
num_GPU = 0
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
inter_op_parallelism_threads=num_cores,
allow_soft_placement=True,
device_count = {'CPU' : num_CPU,
'GPU' : num_GPU}
)
session = tf.Session(config=config)
K.set_session(session)
Here, with booleans GPU and CPU, we indicate whether we would like to run our code with the GPU or CPU by rigidly defining the number of GPUs and CPUs the TensorFlow session is allowed to access. The variables num_GPU and num_CPU define this value. num_cores then sets the number of CPU cores available for usage via intra_op_parallelism_threads and inter_op_parallelism_threads.
The intra_op_parallelism_threads variable dictates the number of threads a parallel operation in a single node of the computation graph is allowed to use (intra), while the inter_op_parallelism_threads variable defines the number of threads accessible for parallel operations across the nodes of the computation graph (inter).
allow_soft_placement allows operations to be run on the CPU if any of the following criteria are met:
there is no GPU implementation for the operation
there are no GPU devices known or registered
there is a need to co-locate with other inputs from the CPU
All of this is executed in the constructor of my class before any other operations, and is completely separable from any model or other code I use.
Note: This requires tensorflow-gpu and cuda/cudnn to be installed because the option is given to use a GPU.
As per the Keras tutorial, you can simply use the same tf.device scope as in regular TensorFlow:
with tf.device('/gpu:0'):
x = tf.placeholder(tf.float32, shape=(None, 20, 64))
y = LSTM(32)(x) # all ops in the LSTM layer will live on GPU:0
with tf.device('/cpu:0'):
x = tf.placeholder(tf.float32, shape=(None, 20, 64))
y = LSTM(32)(x) # all ops in the LSTM layer will live on CPU:0
I just spent some time figuring it out.
Thoma’s answer is not complete.
Say your program is test.py, and you want to use gpu0 to run it while keeping the other GPUs free.
You should write CUDA_VISIBLE_DEVICES=0 python test.py
This is an example that could be used, but where do I introduce it?
In the def of the RNN:
lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
# Split data because rnn cell needs a list of inputs for the RNN inner loop
_X = tf.split(0, n_steps, _X) # n_steps
tf.clip_by_value(_X, -1, 1, name=None)
But this doesn’t make sense, as the tensor _X is the input, not the gradient that is to be clipped.
Do I have to define my own optimizer for this, or is there a simpler option?
Gradient clipping needs to happen after computing the gradients, but before applying them to update the model’s parameters. In your example, both of those things are handled by the AdamOptimizer.minimize() method.
In order to clip your gradients you’ll need to explicitly compute, clip, and apply them as described in this section in TensorFlow’s API documentation. Specifically you’ll need to substitute the call to the minimize() method with something like the following:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
gvs = optimizer.compute_gradients(cost)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
train_op = optimizer.apply_gradients(capped_gvs)
Clipping each gradient matrix individually changes their relative scale but is also possible:
optimizer = tf.train.AdamOptimizer(1e-3)
gradients, variables = zip(*optimizer.compute_gradients(loss))
gradients = [
None if gradient is None else tf.clip_by_norm(gradient, 5.0)
for gradient in gradients]
optimize = optimizer.apply_gradients(zip(gradients, variables))
In TensorFlow 2, a tape computes the gradients, the optimizers come from Keras, and we don’t need to store the update op because it runs automatically without passing it to a session:
optimizer = tf.keras.optimizers.Adam(1e-3)
# ...
with tf.GradientTape() as tape:
loss = ...
variables = ...
gradients = tape.gradient(loss, variables)
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimizer.apply_gradients(zip(gradients, variables))
Calling minimize() takes care of both computing the gradients and
applying them to the variables. If you want to process the gradients
before applying them you can instead use the optimizer in three steps:
Compute the gradients with compute_gradients().
Process the gradients as you wish.
Apply the processed gradients with apply_gradients().
And in the example they provide they use these 3 steps:
# Create an optimizer.
opt = GradientDescentOptimizer(learning_rate=0.1)
# Compute the gradients for a list of variables.
grads_and_vars = opt.compute_gradients(loss, <list of variables>)
# grads_and_vars is a list of tuples (gradient, variable). Do whatever you
# need to the 'gradient' part, for example cap them, etc.
capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]
# Ask the optimizer to apply the capped gradients.
opt.apply_gradients(capped_grads_and_vars)
Here MyCapper is any function that caps your gradient. The list of useful functions (other than tf.clip_by_value()) is here.
For those who would like to understand the idea of gradient clipping (by norm):
Whenever the gradient norm is greater than a particular threshold, we clip the gradient norm so that it stays within the threshold. This threshold is sometimes set to 5.
Let the gradient be g and the max_norm_threshold be j. Whenever ||g|| > j, the gradient is rescaled to g * j / ||g||; otherwise it is left unchanged.
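In TensorFlow this is what tf.clip_by_norm does (g and j being the symbols above):
clipped = tf.clip_by_norm(g, j)  # rescales g to norm j if ||g|| > j, else returns it unchanged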
Gradient clipping basically helps in the case of exploding or vanishing gradients. Say your loss is too high, which will result in exponential gradients flowing through the network, which may result in NaN values. To overcome this we clip the gradients within a specific range (-1 to 1, or any range as per the condition):
clipped_grads_and_vars = [(tf.clip_by_value(grad, -clip_range, +clip_range), var)
                          for grad, var in grads_and_vars]
where grads_and_vars are the pairs of gradients (which you calculate via optimizer.compute_gradients) and the variables they will be applied to.
After clipping we simply apply the result using an optimizer:
optimizer.apply_gradients(clipped_grads_and_vars)
I’m working on a sentiment analysis problem. The data looks like this:
label instances
5 1190
4 838
3 239
1 204
2 127
So my data is unbalanced, since 1190 instances are labeled with 5. For the classification I’m using scikit’s SVC. The problem is that I do not know how to balance my data in the right way in order to accurately compute the precision, recall, accuracy and f1-score for the multiclass case. So I tried the following approaches:
clf = SVC(kernel='linear', C= 1)
clf.fit(X, y)
prediction = clf.predict(X_test)
from sklearn.metrics import precision_score, \
recall_score, confusion_matrix, classification_report, \
accuracy_score, f1_score
print 'Accuracy:', accuracy_score(y_test, prediction)
print 'F1 score:', f1_score(y_test, prediction)
print 'Recall:', recall_score(y_test, prediction)
print 'Precision:', precision_score(y_test, prediction)
print '\n clasification report:\n', classification_report(y_test,prediction)
print '\n confussion matrix:\n',confusion_matrix(y_test, prediction)
F1 score:/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:676: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
sample_weight=sample_weight)
/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1172: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
sample_weight=sample_weight)
/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1082: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
sample_weight=sample_weight)
0.930416613529
However, Im getting warnings like this:
/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1172:
DeprecationWarning: The default `weighted` averaging is deprecated,
and from version 0.18, use of precision, recall or F-score with
multiclass or multilabel data or pos_label=None will result in an
exception. Please set an explicit value for `average`, one of (None,
'micro', 'macro', 'weighted', 'samples'). In cross validation use, for
instance, scoring="f1_weighted" instead of scoring="f1"
How can I deal correctly with my unbalanced data in order to compute the classifier’s metrics in the right way?
             precision    recall  f1-score   support

          0       0.65      1.00      0.79        17
          1       0.57      0.75      0.65        16
          2       0.33      0.06      0.10        17

avg / total       0.52      0.60      0.51        50
I think there is a lot of confusion about which weights are used for what. I am not sure I know precisely what bothers you so I am going to cover different topics, bear with me ;).
Class weights
The weights from the class_weight parameter are used to train the classifier.
They are not used in the calculation of any of the metrics you are using: with different class weights, the numbers will be different simply because the classifier is different.
Basically, in every scikit-learn classifier, the class weights are used to tell your model how important a class is. That means that during training, the classifier will make an extra effort to properly classify the classes with high weights.
How they do that is algorithm-specific. If you want details about how it works for SVC and the doc does not make sense to you, feel free to mention it.
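A minimal sketch of setting class weights on SVC (the weight values here are arbitrary):
from sklearn.svm import SVC
clf = SVC(kernel='linear', C=1, class_weight={1: 10})  # errors on class 1 are penalized 10x
# class_weight='balanced' instead weights classes inversely to their frequencies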
The metrics
Once you have a classifier, you want to know how well it is performing.
Here you can use the metrics you mentioned: accuracy, recall_score, f1_score…
Usually when the class distribution is unbalanced, accuracy is considered a poor choice as it gives high scores to models which just predict the most frequent class.
I will not detail all these metrics but note that, with the exception of accuracy, they are naturally applied at the class level: as you can see in this print of a classification report they are defined for each class. They rely on concepts such as true positives or false negative that require defining which class is the positive one.
F1 score:/usr/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:676: DeprecationWarning: The
default `weighted` averaging is deprecated, and from version 0.18,
use of precision, recall or F-score with multiclass or multilabel data
or pos_label=None will result in an exception. Please set an explicit
value for `average`, one of (None, 'micro', 'macro', 'weighted',
'samples'). In cross validation use, for instance,
scoring="f1_weighted" instead of scoring="f1".
You get this warning because you are using the f1-score, recall and precision without defining how they should be computed!
The question could be rephrased: from the above classification report, how do you output one global number for the f1-score?
You could:
Take the unweighted average of the f1-scores of each class: this is called macro averaging. (Note: the avg / total row in the report above is actually the support-weighted average, not the macro average.)
Compute the f1-score using the global counts of true positives / false negatives, etc. (summing the numbers of true positives / false negatives over all classes). This is known as micro averaging.
Compute a weighted average of the f1-score. Using 'weighted' in scikit-learn will weigh the f1-score by the support of the class: the more elements a class has, the more important the f1-score for this class is in the computation.
These are 3 of the options in scikit-learn, and the warning is there to say you have to pick one. So you have to specify an average argument for the scoring function.
Which one you choose depends on how you want to measure the performance of the classifier: for instance, macro-averaging does not take class imbalance into account, and the f1-score of class 1 will be just as important as the f1-score of class 5. With weighted averaging, however, class 5 will carry more importance.
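To make the three averaging options above concrete, here is a small sketch (toy labels) showing that they generally produce three different numbers for the same predictions:

from sklearn.metrics import f1_score

y_true = [0, 0, 0, 0, 1, 1, 2]   # class 0 dominates
y_pred = [0, 0, 0, 0, 1, 0, 2]

print(f1_score(y_true, y_pred, average="macro"))     # unweighted mean of per-class f1
print(f1_score(y_true, y_pred, average="micro"))     # from global TP/FP/FN counts
print(f1_score(y_true, y_pred, average="weighted"))  # per-class f1 weighted by support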
The way these arguments are specified is not super-clear in scikit-learn right now; according to the docs it will get better in version 0.18. The developers are removing some non-obvious default behavior, and they are issuing warnings so that users notice the change.
Computing scores
Last thing I want to mention (feel free to skip it if you’re aware of it) is that scores are only meaningful if they are computed on data that the classifier has never seen.
This is extremely important as any score you get on data that was used in fitting the classifier is completely irrelevant.
Here’s a way to do it using StratifiedShuffleSplit, which gives you random splits of your data (after shuffling) that preserve the label distribution.
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.svm import SVC

# We use a utility to generate artificial classification data.
X, y = make_classification(n_samples=100, n_informative=10, n_classes=3)
svc = SVC()  # the classifier was missing from the original snippet
sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.5, random_state=0)
for train_idx, test_idx in sss:
    X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    print(f1_score(y_test, y_pred, average="macro"))
    print(precision_score(y_test, y_pred, average="macro"))
    print(recall_score(y_test, y_pred, average="macro"))
Hope this helps.
Answer 1
There are a lot of very detailed answers here, but I don't think you are answering the right questions. As I understand it, there are two questions:
How do I score a multiclass problem?
How do I deal with unbalanced data?
1.
Most of the scoring functions in scikit-learn can be used with multiclass problems as well as single-class problems. For example:
from sklearn.metrics import precision_recall_fscore_support as score

predicted = [1, 2, 3, 4, 5, 1, 2, 1, 1, 4, 5]
y_test    = [1, 2, 3, 4, 5, 1, 2, 1, 1, 4, 1]

precision, recall, fscore, support = score(y_test, predicted)

print('precision: {}'.format(precision))  # [ 1.   1.   1.   1.   0.5]
print('recall: {}'.format(recall))        # [ 0.8  1.   1.   1.   1. ]
print('fscore: {}'.format(fscore))        # [ 0.889  1.  1.  1.  0.667]
print('support: {}'.format(support))      # [5 2 1 2 1]
2.
From this output you can tell whether the unbalanced data is even a problem. If the scores for the less represented classes (class 1 and 2) are lower than for the classes with more training samples (class 4 and 5), then you know that the unbalanced data is in fact a problem, and you can act accordingly, as described in some of the other answers in this thread.
However, if the same class distribution is present in the data you want to predict on, your unbalanced training data is a good representative of that data, and hence, the imbalance is a good thing.
Responding to the question ‘what metric should be used for multi-class classification with imbalanced data’: Macro-F1-measure.
Macro Precision and Macro Recall can also be used, but they are not as easily interpretable as for binary classification; they are already incorporated into the F-measure, and excess metrics complicate method comparison, parameter tuning, and so on.
Micro averaging is sensitive to class imbalance: if your method, for example, works well for the most common labels and totally messes up the others, micro-averaged metrics will still show good results.
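A quick sketch of that effect with toy labels: a classifier that never predicts a minority class can still look strong under micro averaging, while macro averaging exposes it (scikit-learn may warn that class 1 has no predicted samples):

from sklearn.metrics import f1_score

# 90 samples of class 0, 10 of class 1; the model never predicts class 1.
y_true = [0] * 90 + [1] * 10
y_pred = [0] * 100

print(f1_score(y_true, y_pred, average="micro"))  # 0.9  -- looks fine
print(f1_score(y_true, y_pred, average="macro"))  # ~0.47 -- reveals the failure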
Weighted averaging isn’t well suited for imbalanced data, because it weights by label counts. Moreover, it is hard to interpret and unpopular: for instance, there is no mention of such an averaging in the following very detailed survey, which I strongly recommend looking through:
Sokolova, Marina, and Guy Lapalme. “A systematic analysis of
performance measures for classification tasks.” Information Processing
& Management 45.4 (2009): 427-437.
Application-specific question
However, returning to your task, I’d research 2 topics:
metrics commonly used for your specific task – this lets you (a) compare your method with others and understand whether you are doing something wrong, and (b) reuse someone else’s findings instead of exploring this all by yourself;
the cost of different errors of your method – for example, the use-case of your application may rely on 4- and 5-star reviews only; in this case, a good metric should count only these 2 labels.
Commonly used metrics.
As I can infer after looking through the literature, there are 2 main evaluation metrics:
Accuracy, which is used, for example, in
Yu, April, and Daryl Chang. “Multiclass Sentiment Prediction using
Yelp Business.”
(link) – note that the authors work with almost the same distribution of ratings, see Figure 5.
Pang, Bo, and Lillian Lee. “Seeing stars: Exploiting class
relationships for sentiment categorization with respect to rating
scales.” Proceedings of the 43rd Annual Meeting on Association for
Computational Linguistics. Association for Computational Linguistics,
2005.
MSE (or, less often, Mean Absolute Error – MAE) – see, for example,
Lee, Moontae, and R. Grafe. “Multiclass sentiment analysis with
restaurant reviews.” Final Projects from CS224N (2010).
(link) – they explore both accuracy and MSE, considering the latter to be better
Pappas, Nikolaos, and Andrei Popescu-Belis. “Explaining
the Stars: Weighted Multiple-Instance Learning for Aspect-Based
Sentiment Analysis.” Proceedings of the 2014 Conference on Empirical
Methods in Natural Language Processing. No. EPFL-CONF-200899. 2014.
(link) – they use scikit-learn for the evaluation and baseline approaches and state that their code is available; however, I can’t find it, so if you need it, write to the authors – the work is pretty new and seems to be written in Python.
Cost of different errors.
If you care more about avoiding gross blunders, e.g. assigning a 1-star label to a 5-star review or something like that, look at MSE;
if the size of the difference matters, but not that much, try MAE, since it doesn’t square the difference;
otherwise stay with Accuracy.
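A toy sketch of the MSE vs. MAE point above, treating the star labels as plain numbers: one gross blunder and several small misses have the same MAE, but MSE singles out the blunder:

import numpy as np

y_true    = np.array([5, 4, 3, 2, 1])
y_blunder = np.array([1, 4, 3, 2, 1])   # one 4-star mistake
y_small   = np.array([4, 3, 2, 1, 1])   # four 1-star mistakes

mse = lambda a, b: np.mean((a - b) ** 2)
mae = lambda a, b: np.mean(np.abs(a - b))

print(mse(y_true, y_blunder), mae(y_true, y_blunder))  # 3.2  0.8
print(mse(y_true, y_small),   mae(y_true, y_small))    # 0.8  0.8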
About approaches, not metrics
Try regression approaches, e.g. SVR, since they generally outperform multiclass classifiers like SVC or OVA SVM.
First of all, it is a little harder to tell from counting analysis alone whether your data is unbalanced. For example: is 1 positive observation in 1000 just noise, an error, or a breakthrough in science? You never know.
So it is always better to use all your available knowledge and judge the data’s status wisely.
Okay, what if it’s really unbalanced?
Once again — look at your data. Sometimes you can find one or two observations multiplied a hundred times. Sometimes it is useful to create such fake single-class observations, as sketched below.
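As a crude sketch of that duplication idea (random oversampling of the rare class, with made-up data):

import numpy as np

# Made-up data: 95 majority samples, 5 minority samples.
X = np.random.randn(100, 5)
y = np.array([0] * 95 + [1] * 5)

# Duplicate randomly chosen minority rows until the classes are balanced.
minority_idx = np.where(y == 1)[0]
extra = np.random.choice(minority_idx, size=90, replace=True)

X_balanced = np.vstack([X, X[extra]])
y_balanced = np.concatenate([y, y[extra]])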
If all the data is clean, the next step is to use class weights in the prediction model.
So what about multiclass metrics?
In my experience, none of the metrics you mention is commonly used. There are two main reasons.
First: it is always better to work with probabilities than with hard predictions (because otherwise, how could you separate models that predict 0.9 and 0.6 if they both give you the same class?).
And second: it is much easier to compare your prediction models and build new ones when you depend on only one good metric.
From my experience I would recommend log loss or MSE (mean squared error).
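For example, log loss rewards well-calibrated probabilities. A rough sketch with made-up probabilities, where two models agree on every hard class but differ in confidence (so accuracy cannot tell them apart):

from sklearn.metrics import log_loss

y_true = [1, 0, 1, 1]
p_confident = [[0.1, 0.9], [0.9, 0.1], [0.2, 0.8], [0.3, 0.7]]
p_hesitant  = [[0.4, 0.6], [0.6, 0.4], [0.45, 0.55], [0.4, 0.6]]

# The argmax is the same everywhere, but log loss separates the models.
print(log_loss(y_true, p_confident))  # ~0.20
print(log_loss(y_true, p_hesitant))   # ~0.53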
How to fix sklearn warnings?
Simply (as yangjie noticed) override the average parameter with one of these values: 'micro' (metrics are computed globally), 'macro' (metrics are computed for each label and their unweighted mean is taken) or 'weighted' (like macro, but weighted by each label's support).
f1_score(y_test, prediction, average='weighted')
All your warnings came from calling the metrics functions with the default average value 'binary', which is inappropriate for multiclass prediction.
Good luck and have fun with machine learning!
Edit:
I noticed another answer here recommending a switch to regression approaches (e.g. SVR), which I cannot agree with. As far as I remember, there is not even such a thing as multiclass regression. Yes, there is multilabel regression, which is far different, and yes, in some cases it is possible to switch between regression and classification (if the classes are somehow sorted), but it is pretty rare.
What I would recommend (within the scope of scikit-learn) is to try other very powerful classification tools: gradient boosting, random forest (my favorite), KNeighbors and many more.
After that you can compute the arithmetic or geometric mean of their predictions, and most of the time you will get an even better result.
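A rough sketch of such blending (made-up data; in practice you would average the predictions on held-out data, per the note on scoring above):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

X, y = make_classification(n_samples=200, n_informative=10, n_classes=3)

rf = RandomForestClassifier(n_estimators=100).fit(X, y)
gb = GradientBoostingClassifier().fit(X, y)

# Arithmetic mean of the two models' class probabilities.
blended = (rf.predict_proba(X) + gb.predict_proba(X)) / 2.0
y_pred = blended.argmax(axis=1)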
The idea behind StandardScaler is that it will transform your data such that its distribution will have a mean value 0 and standard deviation of 1.
In case of multivariate data, this is done feature-wise (in other words independently for each column of the data).
Given the distribution of the data, each value in the dataset will have the mean value subtracted, and then divided by the standard deviation of the whole dataset (or feature in the multivariate case).
from sklearn.preprocessing import StandardScaler
import numpy as np

# 4 samples/observations and 2 variables/features
data = np.array([[0, 0], [1, 0], [0, 1], [1, 1]])

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

print(data)
# [[0 0]
#  [1 0]
#  [0 1]
#  [1 1]]
print(scaled_data)
# [[-1. -1.]
#  [ 1. -1.]
#  [-1.  1.]
#  [ 1.  1.]]
Intro: I assume that you have a matrix X where each row/line is a sample/observation and each column is a variable/feature (this is the expected input for any sklearn ML function by the way — X.shape should be [number_of_samples, number_of_features]).
Core of method: The main idea is to normalize/standardize (μ = 0 and σ = 1) your features/variables/columns of X individually, before applying any machine learning model.
StandardScaler() will normalize the features i.e. each column of X, INDIVIDUALLY, so that each column/feature/variable will have μ = 0 and σ = 1.
P.S.: I find the most upvoted answer on this page wrong. I am quoting: “each value in the dataset will have the mean value subtracted, and then divided by the standard deviation of the whole dataset” — this is neither true nor correct: the mean and standard deviation are computed per feature/column, not for the whole dataset.
StandardScaler performs the task of standardization. Usually a dataset contains variables that differ in scale. For example, an employee dataset will contain an AGE column with values on the scale 20-70 and a SALARY column with values on the scale 10000-80000. As these two columns differ in scale, they are standardized to a common scale while building a machine learning model.
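A minimal sketch with made-up employee numbers, showing both columns being brought to a common scale:

import numpy as np
from sklearn.preprocessing import StandardScaler

# Made-up employees: AGE in years, SALARY in dollars.
employees = np.array([[25.0, 20000.0],
                      [40.0, 55000.0],
                      [60.0, 70000.0]])

scaled = StandardScaler().fit_transform(employees)
# Each column now has mean 0 and unit standard deviation,
# so AGE and SALARY contribute on comparable scales.
print(scaled)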
This is useful when you want to compare data that correspond to different units. In that case, you want to remove the units. To do that consistently across all the data, you transform the data in a way that the variance is unitary and the mean of the series is 0.
The answers above are great, but I needed a simple example to alleviate some concerns that I have had in the past. I wanted to make sure it was indeed treating each column separately. I am now reassured and can’t find what example had caused me concern. All columns ARE scaled separately as described by those above.
CODE
import pandas as pd
import scipy.stats as ss
from sklearn.preprocessing import StandardScaler

data = [[1, 1, 1, 1, 1], [2, 5, 10, 50, 100], [3, 10, 20, 150, 200], [4, 15, 40, 200, 300]]

df = pd.DataFrame(data, columns=['N0', 'N1', 'N2', 'N3', 'N4']).astype('float64')
sc_X = StandardScaler()
df = sc_X.fit_transform(df)

num_cols = len(df[0, :])
for i in range(num_cols):
    col = df[:, i]
    col_stats = ss.describe(col)
    print(col_stats)
The scipy.stats module is correctly reporting the “sample” variance, which uses (n - 1) in the denominator. The “population” variance would use n in the denominator for the calculation of variance. (StandardScaler itself divides by the population standard deviation, so the population variance of each scaled column is exactly 1, while the sample variance scipy reports is n/(n - 1) = 4/3 ≈ 1.333 here.) To understand better, please see the code below that uses scaled data from the first column of the data set above:
Code
import scipy.stats as ss

sc_Data = [[-1.34164079], [-0.4472136], [0.4472136], [1.34164079]]
col_stats = ss.describe([-1.34164079, -0.4472136, 0.4472136, 1.34164079])
print(col_stats)
print()

# Mean, computed by hand.
mean_by_hand = 0
for row in sc_Data:
    for element in row:
        mean_by_hand += element
mean_by_hand /= 4

# Sum of squared deviations, computed by hand.
variance_by_hand = 0
for row in sc_Data:
    for element in row:
        variance_by_hand += (mean_by_hand - element)**2

sample_variance_by_hand = variance_by_hand / 3   # (n - 1) denominator
sample_std_dev_by_hand = sample_variance_by_hand ** 0.5

pop_variance_by_hand = variance_by_hand / 4      # n denominator
pop_std_dev_by_hand = pop_variance_by_hand ** 0.5

print("Sample Calcs:")
print(mean_by_hand, sample_variance_by_hand, sample_std_dev_by_hand, '\n')
print("Population Calcs:")
print(mean_by_hand, pop_variance_by_hand, pop_std_dev_by_hand)
Output
DescribeResult(nobs=4, minmax=(-1.34164079, 1.34164079), mean=0.0, variance=1.3333333422778562, skewness=0.0, kurtosis=-1.36000000429325)
Sample Calcs:
0.0 1.3333333422778562 1.1547005422523435
Population Calcs:
0.0 1.000000006708392 1.000000003354196
Answer 6
Here is a simple working example to explain how the standardization calculation works. The theory part is already well explained in the other answers.
>>> import numpy as np
>>> data = [[6, 2], [4, 2], [6, 4], [8, 2]]
>>> a = np.array(data)
>>> np.std(a, axis=0)
array([1.41421356, 0.8660254 ])
>>> np.mean(a, axis=0)
array([6. , 2.5])
>>> from sklearn.preprocessing import StandardScaler
>>> scaler = StandardScaler()
>>> scaler.fit(data)
StandardScaler(copy=True, with_mean=True, with_std=True)
>>> print(scaler.mean_)
[6.  2.5]
>>> # X_scaled = (X - μ) / σ, where μ is the mean and σ the standard deviation
>>> z = scaler.transform(data)
>>> z
array([[ 0.        , -0.57735027],
       [-1.41421356, -0.57735027],
       [ 0.        ,  1.73205081],
       [ 1.41421356, -0.57735027]])