Skip to content

Instantly share code, notes, and snippets.

@ian-ross
Last active August 29, 2015 14:07
Show Gist options
  • Save ian-ross/18425aa48e16e7dba02a to your computer and use it in GitHub Desktop.
Save ian-ross/18425aa48e16e7dba02a to your computer and use it in GitHub Desktop.
Haskell data analysis: Z500 anomaly data PCA calculation
module Main where
import Control.Applicative ((<$>))
import Control.Monad (forM_, foldM, when)
import System.FilePath
import qualified Data.Array.Repa as Repa
import qualified Data.Array.Repa.Eval as Repa
import Data.Array.Repa.Repr.ForeignPtr (F)
import Data.NetCDF
import Data.NetCDF.HMatrix
import Foreign.C
import qualified Data.Map as M
import qualified Data.NetCDF.Vector as V
import qualified Data.NetCDF.Repa as R
import qualified Data.Vector.Storable as SV
import Numeric.LinearAlgebra.HMatrix
import Data.Packed.Matrix (extractColumns)
import Debug.Trace
type SVRet a = IO (Either NcError (SV.Vector a))
type FArray3 a = Repa.Array F Repa.DIM3 a
type FArray2 a = Repa.Array F Repa.DIM2 a
type RepaRet3 a = IO (Either NcError (FArray3 a))
type RepaRet2 a = IO (Either NcError (FArray2 a))
type RepaRet1 a = IO (Either NcError (Repa.Array F Repa.DIM1 a))
basedir, indir, outdir :: FilePath
basedir = "/big/project-storage/haskell-data-analysis/atmos-non-diffusive"
indir = basedir </> "pre-processing"
outdir = basedir </> "pca"
type V = Vector Double
type M = Matrix Double
-- Offline calculation of mean and covariance of a set of data vectors.
meanCovM :: Int -> (Int -> IO V) -> IO (V, M)
meanCovM nrow readrow = do
-- Accumulate values for mean calculation.
refrow <- readrow 0
let maddone acc i = do
row <- readrow i
return $! acc + row
mtotal <- foldM maddone refrow [1..nrow-1]
-- Calculate sample mean.
let mean = scale (1.0 / fromIntegral nrow) mtotal
-- Accumulate differences from mean for covariance calculation.
let refdiff = refrow - mean
caddone acc i = do
row <- readrow i
let diff = row - mean
return $! acc + (diff `outer` diff)
ctotal <- foldM caddone (refdiff `outer` refdiff) [1..nrow-1]
-- Calculate sample covariance.
let cov = scale (1.0 / fromIntegral (nrow - 1)) ctotal
return (mean, cov)
-- Perform PCA decomposition and project data points onto PCA
-- eigenvectors: return value is (data mean; PCA eigenvalues in
-- descending order; explained variance per PCA eigenvector; PCA
-- eigenvectors as rows of a matrix; projections of data points onto
-- PCA eigenvectors, one per row of a matrix).
--
-- Here, the decomposition is done "offline", using IO actions to read
-- data rows and to write projected data outputs.
pcaM :: Int -> (Int -> IO V) -> (Int -> V -> IO ()) -> IO (V, M)
pcaM nrow readrow writeproj = do
(mean, cov) <- meanCovM nrow readrow
let (_, evals, evecCols) = svd cov
evecs = fromRows $ map evecSigns $ toColumns evecCols
evecSigns ev = let maxelidx = maxIndex $ cmap abs ev
sign = signum (ev ! maxelidx)
in cmap (sign *) ev
varexp = scale (1.0 / sumElements evals) evals
project x = evecs #> (x - mean)
forM_ [0..nrow-1] $ \r -> readrow r >>= writeproj r . project
return (varexp, evecs)
main :: IO ()
main = do
-- Open anomaly data file for input.
Right innc <- openFile $ indir </> "z500-anomalies.nc"
let Just nlat = ncDimLength <$> ncDim innc "lat"
Just nlon = ncDimLength <$> ncDim innc "lon"
Just ntime = ncDimLength <$> ncDim innc "time"
nspace = nlat * nlon
let (Just lonvar) = ncVar innc "lon"
(Just latvar) = ncVar innc "lat"
(Just timevar) = ncVar innc "time"
(Just z500var) = ncVar innc "z500"
Right lon <- get innc lonvar :: SVRet CFloat
Right lat <- get innc latvar :: SVRet CFloat
Right time <- get innc timevar :: SVRet CDouble
-- Set up output file.
let outlatdim = NcDim "lat" nlat False
outlatvar = NcVar "lat" NcFloat [outlatdim] (ncVarAttrs latvar)
outlondim = NcDim "lon" nlon False
outlonvar = NcVar "lon" NcFloat [outlondim] (ncVarAttrs lonvar)
outtimedim = NcDim "time" 0 True
outtimevar = NcVar "time" NcDouble [outtimedim] (ncVarAttrs timevar)
outpcdim = NcDim "pc" nspace False
outpcvar = NcVar "pc" NcInt [outpcdim] M.empty
outepatvar = NcVar "epat" NcDouble
[outpcdim, outlatdim, outlondim] M.empty
outvarexpvar = NcVar "varexp" NcDouble [outpcdim] M.empty
outprojvar = NcVar "proj" NcDouble [outtimedim, outpcdim] M.empty
outncinfo =
emptyNcInfo (outdir </> "z500-pca-offline.nc") #
addNcDim outlatdim # addNcDim outlondim #
addNcDim outtimedim # addNcDim outpcdim #
addNcVar outlatvar # addNcVar outlonvar #
addNcVar outtimevar # addNcVar outpcvar #
addNcVar outepatvar # addNcVar outvarexpvar # addNcVar outprojvar
-- Data scaling based on square root of cos of latitude.
let latscale = SV.map (\lt -> realToFrac $ sqrt $ cos (lt / 180.0 * pi)) lat
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $
\outnc -> do
-- Write coordinate variable values.
put outnc outlatvar lat
put outnc outlonvar lon
putA outnc outtimevar [0] [ntime] time
put outnc outpcvar $ (SV.enumFromN 0 nspace :: SV.Vector CInt)
-- Worker to read a single data row and convert anomaly data to
-- floating point values, scaling by square root of cos of
-- latitude.
let readrow :: Int -> IO V
readrow it = do
let start = [it, 0, 0]
count = [1, nlat, nlon]
Right z500short <- getA innc z500var start count :: RepaRet2 CShort
let z500 = build nspace
(\s -> let is = truncate s :: Int
(ilat, ilon) = divMod is nlon
i = Repa.Z Repa.:. ilat Repa.:. ilon
in (latscale ! ilat) *
(fromIntegral $ z500short Repa.! i))
return z500
-- Worker to write a single data row projection.
let writeproj it proj = do
let cproj = cmap realToFrac proj :: Vector CDouble
prstart = [it, 0]
prcount = [1, nspace]
putA outnc outprojvar prstart prcount $ HVector cproj
return ()
-- Perform PCA.
(varexp, evecs) <- pcaM ntime readrow writeproj
putStrLn $ show $ map (100 *) $ take 10 $ toList varexp
putStrLn $ show $ sum $ toList varexp
-- Write explained variance and PCA eigenpatterns.
let outvarexp =
SV.fromList $ map realToFrac $ toList varexp :: SV.Vector CDouble
put outnc outvarexpvar outvarexp
forM_ [0..nspace-1] $ \is -> do
let pat = cmap realToFrac $ reshape nlon $ evecs ! is :: Matrix CDouble
pastart = [is, 0, 0]
pacount = [1, nlat, nlon]
putA outnc outepatvar pastart pacount $ HMatrix pat
return ()
module Main where
import Control.Applicative ((<$>))
import Control.Monad (forM_)
import System.FilePath
import qualified Data.Array.Repa as Repa
import qualified Data.Array.Repa.Eval as Repa
import Data.Array.Repa.Repr.ForeignPtr (F)
import Data.NetCDF
import Data.NetCDF.HMatrix
import Foreign.C
import qualified Data.Map as M
import qualified Data.NetCDF.Vector as V
import qualified Data.NetCDF.Repa as R
import qualified Data.Vector.Storable as SV
import Numeric.LinearAlgebra.HMatrix
import Data.Packed.Matrix (extractColumns)
import Debug.Trace
type SVRet a = IO (Either NcError (SV.Vector a))
type FArray3 a = Repa.Array F Repa.DIM3 a
type FArray2 a = Repa.Array F Repa.DIM2 a
type RepaRet3 a = IO (Either NcError (FArray3 a))
type RepaRet2 a = IO (Either NcError (FArray2 a))
type RepaRet1 a = IO (Either NcError (Repa.Array F Repa.DIM1 a))
basedir, indir, outdir :: FilePath
basedir = "/big/project-storage/haskell-data-analysis/atmos-non-diffusive"
indir = basedir </> "pre-processing"
outdir = basedir </> "pca"
-- Perform PCA decomposition and project data points onto PCA
-- eigenvectors: return value is (data mean; PCA eigenvalues in
-- descending order; explained variance per PCA eigenvector; PCA
-- eigenvectors as rows of a matrix; projections of data points onto
-- PCA eigenvectors, one per row of a matrix).
pca :: Matrix Double -> (Vector Double, Matrix Double, Matrix Double)
pca xs = (varexp, evecs, projs)
where (mean, cov) = meanCov xs
(_, evals, evecCols) = svd cov
evecs = fromRows $ map evecSigns $ toColumns evecCols
evecSigns ev = let maxelidx = maxIndex $ cmap abs ev
sign = signum (ev ! maxelidx)
in cmap (sign *) ev
varexp = scale (1.0 / sumElements evals) evals
projs = fromRows $ map project $ toRows xs
project x = evecs #> (x - mean)
main :: IO ()
main = do
-- Open anomaly data file for input.
Right innc <- openFile $ indir </> "z500-anomalies.nc"
let Just nlat = ncDimLength <$> ncDim innc "lat"
Just nlon = ncDimLength <$> ncDim innc "lon"
Just ntime = ncDimLength <$> ncDim innc "time"
nspace = nlat * nlon
let (Just lonvar) = ncVar innc "lon"
(Just latvar) = ncVar innc "lat"
(Just timevar) = ncVar innc "time"
(Just z500var) = ncVar innc "z500"
Right lon <- get innc lonvar :: SVRet CFloat
Right lat <- get innc latvar :: SVRet CFloat
Right time <- get innc timevar :: SVRet CDouble
Right z500short <- get innc z500var :: RepaRet3 CShort
-- Convert anomaly data to a matrix of floating point values,
-- scaling by square root of cos of latitude.
let latscale = SV.map (\lt -> realToFrac $ sqrt $ cos (lt / 180.0 * pi)) lat
z500 = build (ntime, nspace)
(\t s -> let it = truncate t :: Int
is = truncate s :: Int
(ilat, ilon) = divMod is nlon
i = Repa.Z Repa.:. it Repa.:. ilat Repa.:. ilon
in (latscale ! ilat) *
(fromIntegral $ z500short Repa.! i)) :: Matrix Double
-- Perform PCA.
let (varexp, evecs, projs) = pca z500
putStrLn $ show $ map (100 *) $ take 10 $ toList varexp
putStrLn $ show $ sum $ toList varexp
-- Set up output file.
let outlatdim = NcDim "lat" nlat False
outlatvar = NcVar "lat" NcFloat [outlatdim] (ncVarAttrs latvar)
outlondim = NcDim "lon" nlon False
outlonvar = NcVar "lon" NcFloat [outlondim] (ncVarAttrs lonvar)
outtimedim = NcDim "time" 0 True
outtimevar = NcVar "time" NcDouble [outtimedim] (ncVarAttrs timevar)
outpcdim = NcDim "pc" nspace False
outpcvar = NcVar "pc" NcInt [outpcdim] M.empty
outepatvar = NcVar "epat" NcDouble
[outpcdim, outlatdim, outlondim] M.empty
outvarexpvar = NcVar "varexp" NcDouble [outpcdim] M.empty
outprojvar = NcVar "proj" NcDouble [outtimedim, outpcdim] M.empty
outncinfo =
emptyNcInfo (outdir </> "z500-pca.nc") #
addNcDim outlatdim # addNcDim outlondim #
addNcDim outtimedim # addNcDim outpcdim #
addNcVar outlatvar # addNcVar outlonvar #
addNcVar outtimevar # addNcVar outpcvar #
addNcVar outepatvar # addNcVar outvarexpvar # addNcVar outprojvar
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $
\outnc -> do
-- Write coordinate variable values.
put outnc outlatvar lat
put outnc outlonvar lon
putA outnc outtimevar [0] [ntime] time
put outnc outpcvar $ (SV.enumFromN 0 nspace :: SV.Vector CInt)
let outvarexp =
SV.fromList $ map realToFrac $ toList varexp :: SV.Vector CDouble
put outnc outvarexpvar outvarexp
forM_ [0..nspace-1] $ \is -> do
let pat = cmap realToFrac $ reshape nlon $ evecs ! is :: Matrix CDouble
pastart = [is, 0, 0]
pacount = [1, nlat, nlon]
putA outnc outepatvar pastart pacount $ HMatrix pat
forM_ [0..ntime-1] $ \it -> do
let proj = cmap realToFrac $ projs ! it :: Vector CDouble
prstart = [it, 0]
prcount = [1, nspace]
putA outnc outprojvar prstart prcount $ HVector proj
return ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment