module MachineLearning.Clustering
(
Cluster(..)
, kmeans
, kmeansIterM
)
where
import MachineLearning.Types (R, Vector, Matrix)
import Data.List (sortOn, groupBy, minimumBy)
import Control.Applicative ((<$>))
import Control.Monad (forM)
import qualified Control.Monad.Random as RndM
import qualified Data.Vector as V
import qualified Numeric.LinearAlgebra as LA
import MachineLearning.Random (sampleM)
type Cluster = V.Vector Vector
nearestCentroidIndex :: V.Vector Vector
-> Vector
-> Int
nearestCentroidIndex centroids v =
let distances = V.map (\centroid -> LA.norm_2 (v-centroid)) centroids
in V.minIndex distances
calcClusterCost :: Cluster
-> Vector
-> R
calcClusterCost cluster centroid = sum $ fmap (\sample -> LA.norm_2 $ sample-centroid) cluster
calcCost :: V.Vector Cluster
-> V.Vector Vector
-> R
calcCost clusters centroids = sum $ V.zipWith calcClusterCost clusters centroids
getNewCentroid :: Cluster
-> Vector
getNewCentroid cluster =
let n = length cluster
centroid = (sum cluster) / (fromIntegral n)
in centroid
moveCentroids :: V.Vector Cluster
-> V.Vector Vector
moveCentroids clusters = fmap getNewCentroid clusters
buildClusterList :: V.Vector Vector
-> V.Vector Int
-> V.Vector Cluster
buildClusterList samples clusterIndicesList = V.fromList $ fmap getClusterSamples clusters''
where clusters' = groupBy (\l r -> snd l == snd r) $ sortOn snd $ zip [0..] $ V.toList clusterIndicesList
clusters'' = map (map fst) clusters'
getClusterSamples clusterIndices = V.fromList $ fmap (samples V.!) clusterIndices
kmeansIter :: V.Vector Vector
-> Int
-> V.Vector Vector
-> (V.Vector Cluster, [R])
kmeansIter samples k initialCentroids =
let iter centroids js =
let clusterIndicesList = fmap (nearestCentroidIndex centroids) samples
clusters = buildClusterList samples clusterIndicesList
centroids' = moveCentroids clusters
j = calcCost clusters centroids'
diff = sum . fmap LA.norm_2 $ V.zipWith (-) centroids centroids'
in if diff < 0.001 then (clusters, j:js)
else iter centroids' (j:js)
in iter initialCentroids []
kmeansIterM :: RndM.RandomGen g =>
V.Vector Vector
-> Int
-> Int
-> RndM.Rand g (V.Vector Cluster, [R])
kmeansIterM samples k _ = do
centroids <- sampleM k samples
return (kmeansIter samples k centroids)
kmeans :: RndM.RandomGen g =>
Int
-> Matrix
-> Int
-> RndM.Rand g (V.Vector Cluster)
kmeans nIters x k = fst <$>
(minimumBy (\(_, js1) (_, js2) -> compare (head js1) (head js2))) <$>
forM [1..nIters] (kmeansIterM samples k)
where samples = V.fromList $ LA.toRows x