Created
September 18, 2014 11:43
-
-
Save ian-ross/d39bf143ffc482ea3700 to your computer and use it in GitHub Desktop.
Two-dimensional PCA example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: pca-2d | |
version: 0.1 | |
synopsis: Two-dimensional PCA example | |
license: BSD3 | |
author: Ian Ross | |
maintainer: [email protected] | |
copyright: Copyright (2014) Ian Ross | |
category: Data | |
build-type: Simple | |
cabal-version: >=1.8 | |
Executable pca-2d | |
main-is: pca-2d.hs | |
build-depends: base >= 4.3 && < 5, | |
hmatrix == 0.16.0.5, | |
Chart == 1.3.1, | |
Chart-cairo == 1.3.1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Control.Monad | |
import Numeric.LinearAlgebra.HMatrix | |
import Graphics.Rendering.Chart.Easy hiding (Matrix, Vector, (|>), scale) | |
import Graphics.Rendering.Chart.Backend.Cairo | |
-- Number of test data points. | |
n :: Int | |
n = 500 | |
-- Mean, standard deviation and correlation for two dimensions of test | |
-- data. | |
meanx, meany, sdx, sdy, rho :: Double | |
meanx = 5.0 ; meany = 3.0 ; sdx = 1.2 ; sdy = 0.6 ; rho = 0.75 | |
-- Generate test data. | |
generateTestData :: Matrix Double | |
generateTestData = | |
let seed = 1023 | |
mean = 2 |> [ meanx, meany ] | |
cov = matrix 2 [ sdx^2 , rho*sdx*sdy | |
, rho*sdx*sdy , sdy^2 ] | |
samples = gaussianSample seed n mean cov | |
in fromRows $ filter ((> 0) . minElement) $ toRows samples | |
-- Perform PCA decomposition and project data points onto PCA | |
-- eigenvectors: return value is (data mean; PCA eigenvalues in | |
-- descending order; explained variance per PCA eigenvector; PCA | |
-- eigenvectors as rows of a matrix; projections of data points onto | |
-- PCA eigenvectors, one per row of a matrix). | |
pca :: Matrix Double -> (Vector Double, Matrix Double, Matrix Double) | |
pca xs = (varexp, evecs, projs) | |
where (mean, cov) = meanCov xs | |
(_, evals, evecCols) = svd cov | |
evecs = fromRows $ map evecSigns $ toColumns evecCols | |
evecSigns ev = let maxelidx = maxIndex $ cmap abs ev | |
sign = signum (ev ! maxelidx) | |
in cmap (sign *) ev | |
varexp = scale (1.0 / sumElements evals) evals | |
projs = fromRows $ map project $ toRows xs | |
project x = evecs #> (x - mean) | |
-- Make plots. | |
doPlot :: FileFormat -> String -> | |
Matrix Double -> Matrix Double -> Matrix Double -> Int | |
-> IO () | |
doPlot ptype suffix dat evecs projs idx = do | |
-- Set up plot characteristics depending on which one we're doing. | |
let (showEvecs, showProjs) = case idx of | |
2 -> (True, False) | |
3 -> (True, True) | |
_ -> (False, False) | |
-- Turn the eigenvectors into plotting data. | |
let toTuple v = (v ! 0, v ! 1) | |
makeEvec idx = ((0, 0), toTuple $ toRows evecs !! idx) | |
evs = [makeEvec 0, makeEvec 1] | |
-- Set up the plot range. | |
let vdat = flatten dat | |
rng = (minElement vdat, maxElement vdat) | |
-- Choose a "typical" point to project. | |
let typposs = map (\x -> x < 3 && x > 2) $ | |
toList $ flatten $ takeColumns 1 projs | |
typys = fromList $ zipWith (\f y -> if f && y > 0.0 then y else 0.0) | |
typposs (toList $ flatten $ dropColumns 1 projs) | |
typidx = maxIndex typys | |
typical = dat ! typidx | |
-- Make square plots... | |
let fname = "pca-2d-" ++ show idx ++ "." ++ suffix | |
toFile (fo_format .~ ptype $ fo_size .~ (600, 600) $ def) fname $ do | |
-- Bigger axis labels and axis labelling. | |
layout_x_axis . laxis_style . axis_label_style . font_size .= 20.0 | |
layout_y_axis . laxis_style . axis_label_style . font_size .= 20.0 | |
when (idx == 0) $ do | |
layout_x_axis . laxis_title .= "Shell length (cm)" | |
layout_x_axis . laxis_title_style . font_size .= 20.0 | |
layout_y_axis . laxis_title .= "Shell width (cm)" | |
layout_y_axis . laxis_title_style . font_size .= 20.0 | |
-- Make both axes cover the same range to get orthogonal-looking | |
-- eigenvectors. | |
layout_x_axis . laxis_generate .= scaledAxis def rng | |
layout_y_axis . laxis_generate .= scaledAxis def rng | |
-- Plot data points. | |
plot $ liftEC $ do | |
plot_points_style . point_radius .= 4 | |
plot_points_style . point_color .= | |
(if idx < 2 then opaque red else withOpacity red 0.5) | |
plot_points_values .= (map (\v -> (v ! 0, v ! 1)) $ toRows dat) | |
-- Plot "typical" point. | |
when showProjs $ plot $ liftEC $ do | |
plot_points_style . point_radius .= 6 | |
plot_points_style . point_color .= opaque green | |
plot_points_values .= [(typical ! 0, typical ! 1)] | |
-- Plot "typical" point projections. | |
let t1 = dat ! typidx ! 0 ; t2 = dat ! typidx ! 1 | |
c1 = projs ! typidx ! 0 ; c2 = projs ! typidx ! 1 | |
e1 = evecs ! 0 ; e2 = evecs ! 1 | |
when showProjs $ do | |
projection c1 e1 (dat ! typidx) | |
projection c2 e2 (dat ! typidx) | |
-- Plot eigenvectors. | |
when showEvecs $ plot (vectorField evs) | |
projection c e t = plot $ liftEC $ do | |
plot_lines_style . line_color .= opaque green | |
plot_lines_values .= [[(0, 0), (c * e ! 0, c * e ! 1), (t ! 0, t ! 1)]] | |
vectorField dat = fmap plotVectorField $ liftEC $ do | |
plot_vectors_values .= dat | |
plot_vectors_scale .= 0 | |
plot_vectors_style . vector_line_style . line_color .= opaque black | |
plot_vectors_style . vector_line_style . line_width .= 3.5 | |
plot_vectors_style . vector_head_style . point_color .= opaque black | |
plot_vectors_style . vector_head_style . point_radius .= 7.0 | |
main :: IO () | |
main = do | |
let dat = generateTestData | |
(varexp, evecs, projs) = pca dat | |
(mean, cov) = meanCov dat | |
cdat = fromRows $ map (subtract mean) $ toRows dat | |
forM_ [(PNG, "png"), (PDF, "pdf"), (SVG, "svg")] $ \(ptype, suffix) -> do | |
doPlot ptype suffix dat evecs projs 0 | |
doPlot ptype suffix cdat evecs projs 1 | |
doPlot ptype suffix cdat evecs projs 2 | |
doPlot ptype suffix cdat evecs projs 3 | |
putStrLn $ "FRACTIONAL VARIANCE EXPLAINED: " ++ show varexp |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment