Last active
August 29, 2015 14:07
-
-
Save ian-ross/18425aa48e16e7dba02a to your computer and use it in GitHub Desktop.
Haskell data analysis: Z500 anomaly data PCA calculation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Control.Applicative ((<$>)) | |
import Control.Monad (forM_, foldM, when) | |
import System.FilePath | |
import qualified Data.Array.Repa as Repa | |
import qualified Data.Array.Repa.Eval as Repa | |
import Data.Array.Repa.Repr.ForeignPtr (F) | |
import Data.NetCDF | |
import Data.NetCDF.HMatrix | |
import Foreign.C | |
import qualified Data.Map as M | |
import qualified Data.NetCDF.Vector as V | |
import qualified Data.NetCDF.Repa as R | |
import qualified Data.Vector.Storable as SV | |
import Numeric.LinearAlgebra.HMatrix | |
import Data.Packed.Matrix (extractColumns) | |
import Debug.Trace | |
type SVRet a = IO (Either NcError (SV.Vector a)) | |
type FArray3 a = Repa.Array F Repa.DIM3 a | |
type FArray2 a = Repa.Array F Repa.DIM2 a | |
type RepaRet3 a = IO (Either NcError (FArray3 a)) | |
type RepaRet2 a = IO (Either NcError (FArray2 a)) | |
type RepaRet1 a = IO (Either NcError (Repa.Array F Repa.DIM1 a)) | |
basedir, indir, outdir :: FilePath | |
basedir = "/big/project-storage/haskell-data-analysis/atmos-non-diffusive" | |
indir = basedir </> "pre-processing" | |
outdir = basedir </> "pca" | |
type V = Vector Double | |
type M = Matrix Double | |
-- Offline calculation of mean and covariance of a set of data vectors. | |
meanCovM :: Int -> (Int -> IO V) -> IO (V, M) | |
meanCovM nrow readrow = do | |
-- Accumulate values for mean calculation. | |
refrow <- readrow 0 | |
let maddone acc i = do | |
row <- readrow i | |
return $! acc + row | |
mtotal <- foldM maddone refrow [1..nrow-1] | |
-- Calculate sample mean. | |
let mean = scale (1.0 / fromIntegral nrow) mtotal | |
-- Accumulate differences from mean for covariance calculation. | |
let refdiff = refrow - mean | |
caddone acc i = do | |
row <- readrow i | |
let diff = row - mean | |
return $! acc + (diff `outer` diff) | |
ctotal <- foldM caddone (refdiff `outer` refdiff) [1..nrow-1] | |
-- Calculate sample covariance. | |
let cov = scale (1.0 / fromIntegral (nrow - 1)) ctotal | |
return (mean, cov) | |
-- Perform PCA decomposition and project data points onto PCA | |
-- eigenvectors: return value is (data mean; PCA eigenvalues in | |
-- descending order; explained variance per PCA eigenvector; PCA | |
-- eigenvectors as rows of a matrix; projections of data points onto | |
-- PCA eigenvectors, one per row of a matrix). | |
-- | |
-- Here, the decomposition is done "offline", using IO actions to read | |
-- data rows and to write projected data outputs. | |
pcaM :: Int -> (Int -> IO V) -> (Int -> V -> IO ()) -> IO (V, M) | |
pcaM nrow readrow writeproj = do | |
(mean, cov) <- meanCovM nrow readrow | |
let (_, evals, evecCols) = svd cov | |
evecs = fromRows $ map evecSigns $ toColumns evecCols | |
evecSigns ev = let maxelidx = maxIndex $ cmap abs ev | |
sign = signum (ev ! maxelidx) | |
in cmap (sign *) ev | |
varexp = scale (1.0 / sumElements evals) evals | |
project x = evecs #> (x - mean) | |
forM_ [0..nrow-1] $ \r -> readrow r >>= writeproj r . project | |
return (varexp, evecs) | |
main :: IO () | |
main = do | |
-- Open anomaly data file for input. | |
Right innc <- openFile $ indir </> "z500-anomalies.nc" | |
let Just nlat = ncDimLength <$> ncDim innc "lat" | |
Just nlon = ncDimLength <$> ncDim innc "lon" | |
Just ntime = ncDimLength <$> ncDim innc "time" | |
nspace = nlat * nlon | |
let (Just lonvar) = ncVar innc "lon" | |
(Just latvar) = ncVar innc "lat" | |
(Just timevar) = ncVar innc "time" | |
(Just z500var) = ncVar innc "z500" | |
Right lon <- get innc lonvar :: SVRet CFloat | |
Right lat <- get innc latvar :: SVRet CFloat | |
Right time <- get innc timevar :: SVRet CDouble | |
-- Set up output file. | |
let outlatdim = NcDim "lat" nlat False | |
outlatvar = NcVar "lat" NcFloat [outlatdim] (ncVarAttrs latvar) | |
outlondim = NcDim "lon" nlon False | |
outlonvar = NcVar "lon" NcFloat [outlondim] (ncVarAttrs lonvar) | |
outtimedim = NcDim "time" 0 True | |
outtimevar = NcVar "time" NcDouble [outtimedim] (ncVarAttrs timevar) | |
outpcdim = NcDim "pc" nspace False | |
outpcvar = NcVar "pc" NcInt [outpcdim] M.empty | |
outepatvar = NcVar "epat" NcDouble | |
[outpcdim, outlatdim, outlondim] M.empty | |
outvarexpvar = NcVar "varexp" NcDouble [outpcdim] M.empty | |
outprojvar = NcVar "proj" NcDouble [outtimedim, outpcdim] M.empty | |
outncinfo = | |
emptyNcInfo (outdir </> "z500-pca-offline.nc") # | |
addNcDim outlatdim # addNcDim outlondim # | |
addNcDim outtimedim # addNcDim outpcdim # | |
addNcVar outlatvar # addNcVar outlonvar # | |
addNcVar outtimevar # addNcVar outpcvar # | |
addNcVar outepatvar # addNcVar outvarexpvar # addNcVar outprojvar | |
-- Data scaling based on square root of cos of latitude. | |
let latscale = SV.map (\lt -> realToFrac $ sqrt $ cos (lt / 180.0 * pi)) lat | |
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $ | |
\outnc -> do | |
-- Write coordinate variable values. | |
put outnc outlatvar lat | |
put outnc outlonvar lon | |
putA outnc outtimevar [0] [ntime] time | |
put outnc outpcvar $ (SV.enumFromN 0 nspace :: SV.Vector CInt) | |
-- Worker to read a single data row and convert anomaly data to | |
-- floating point values, scaling by square root of cos of | |
-- latitude. | |
let readrow :: Int -> IO V | |
readrow it = do | |
let start = [it, 0, 0] | |
count = [1, nlat, nlon] | |
Right z500short <- getA innc z500var start count :: RepaRet2 CShort | |
let z500 = build nspace | |
(\s -> let is = truncate s :: Int | |
(ilat, ilon) = divMod is nlon | |
i = Repa.Z Repa.:. ilat Repa.:. ilon | |
in (latscale ! ilat) * | |
(fromIntegral $ z500short Repa.! i)) | |
return z500 | |
-- Worker to write a single data row projection. | |
let writeproj it proj = do | |
let cproj = cmap realToFrac proj :: Vector CDouble | |
prstart = [it, 0] | |
prcount = [1, nspace] | |
putA outnc outprojvar prstart prcount $ HVector cproj | |
return () | |
-- Perform PCA. | |
(varexp, evecs) <- pcaM ntime readrow writeproj | |
putStrLn $ show $ map (100 *) $ take 10 $ toList varexp | |
putStrLn $ show $ sum $ toList varexp | |
-- Write explained variance and PCA eigenpatterns. | |
let outvarexp = | |
SV.fromList $ map realToFrac $ toList varexp :: SV.Vector CDouble | |
put outnc outvarexpvar outvarexp | |
forM_ [0..nspace-1] $ \is -> do | |
let pat = cmap realToFrac $ reshape nlon $ evecs ! is :: Matrix CDouble | |
pastart = [is, 0, 0] | |
pacount = [1, nlat, nlon] | |
putA outnc outepatvar pastart pacount $ HMatrix pat | |
return () |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Control.Applicative ((<$>)) | |
import Control.Monad (forM_) | |
import System.FilePath | |
import qualified Data.Array.Repa as Repa | |
import qualified Data.Array.Repa.Eval as Repa | |
import Data.Array.Repa.Repr.ForeignPtr (F) | |
import Data.NetCDF | |
import Data.NetCDF.HMatrix | |
import Foreign.C | |
import qualified Data.Map as M | |
import qualified Data.NetCDF.Vector as V | |
import qualified Data.NetCDF.Repa as R | |
import qualified Data.Vector.Storable as SV | |
import Numeric.LinearAlgebra.HMatrix | |
import Data.Packed.Matrix (extractColumns) | |
import Debug.Trace | |
type SVRet a = IO (Either NcError (SV.Vector a)) | |
type FArray3 a = Repa.Array F Repa.DIM3 a | |
type FArray2 a = Repa.Array F Repa.DIM2 a | |
type RepaRet3 a = IO (Either NcError (FArray3 a)) | |
type RepaRet2 a = IO (Either NcError (FArray2 a)) | |
type RepaRet1 a = IO (Either NcError (Repa.Array F Repa.DIM1 a)) | |
basedir, indir, outdir :: FilePath | |
basedir = "/big/project-storage/haskell-data-analysis/atmos-non-diffusive" | |
indir = basedir </> "pre-processing" | |
outdir = basedir </> "pca" | |
-- Perform PCA decomposition and project data points onto PCA | |
-- eigenvectors: return value is (data mean; PCA eigenvalues in | |
-- descending order; explained variance per PCA eigenvector; PCA | |
-- eigenvectors as rows of a matrix; projections of data points onto | |
-- PCA eigenvectors, one per row of a matrix). | |
pca :: Matrix Double -> (Vector Double, Matrix Double, Matrix Double) | |
pca xs = (varexp, evecs, projs) | |
where (mean, cov) = meanCov xs | |
(_, evals, evecCols) = svd cov | |
evecs = fromRows $ map evecSigns $ toColumns evecCols | |
evecSigns ev = let maxelidx = maxIndex $ cmap abs ev | |
sign = signum (ev ! maxelidx) | |
in cmap (sign *) ev | |
varexp = scale (1.0 / sumElements evals) evals | |
projs = fromRows $ map project $ toRows xs | |
project x = evecs #> (x - mean) | |
main :: IO () | |
main = do | |
-- Open anomaly data file for input. | |
Right innc <- openFile $ indir </> "z500-anomalies.nc" | |
let Just nlat = ncDimLength <$> ncDim innc "lat" | |
Just nlon = ncDimLength <$> ncDim innc "lon" | |
Just ntime = ncDimLength <$> ncDim innc "time" | |
nspace = nlat * nlon | |
let (Just lonvar) = ncVar innc "lon" | |
(Just latvar) = ncVar innc "lat" | |
(Just timevar) = ncVar innc "time" | |
(Just z500var) = ncVar innc "z500" | |
Right lon <- get innc lonvar :: SVRet CFloat | |
Right lat <- get innc latvar :: SVRet CFloat | |
Right time <- get innc timevar :: SVRet CDouble | |
Right z500short <- get innc z500var :: RepaRet3 CShort | |
-- Convert anomaly data to a matrix of floating point values, | |
-- scaling by square root of cos of latitude. | |
let latscale = SV.map (\lt -> realToFrac $ sqrt $ cos (lt / 180.0 * pi)) lat | |
z500 = build (ntime, nspace) | |
(\t s -> let it = truncate t :: Int | |
is = truncate s :: Int | |
(ilat, ilon) = divMod is nlon | |
i = Repa.Z Repa.:. it Repa.:. ilat Repa.:. ilon | |
in (latscale ! ilat) * | |
(fromIntegral $ z500short Repa.! i)) :: Matrix Double | |
-- Perform PCA. | |
let (varexp, evecs, projs) = pca z500 | |
putStrLn $ show $ map (100 *) $ take 10 $ toList varexp | |
putStrLn $ show $ sum $ toList varexp | |
-- Set up output file. | |
let outlatdim = NcDim "lat" nlat False | |
outlatvar = NcVar "lat" NcFloat [outlatdim] (ncVarAttrs latvar) | |
outlondim = NcDim "lon" nlon False | |
outlonvar = NcVar "lon" NcFloat [outlondim] (ncVarAttrs lonvar) | |
outtimedim = NcDim "time" 0 True | |
outtimevar = NcVar "time" NcDouble [outtimedim] (ncVarAttrs timevar) | |
outpcdim = NcDim "pc" nspace False | |
outpcvar = NcVar "pc" NcInt [outpcdim] M.empty | |
outepatvar = NcVar "epat" NcDouble | |
[outpcdim, outlatdim, outlondim] M.empty | |
outvarexpvar = NcVar "varexp" NcDouble [outpcdim] M.empty | |
outprojvar = NcVar "proj" NcDouble [outtimedim, outpcdim] M.empty | |
outncinfo = | |
emptyNcInfo (outdir </> "z500-pca.nc") # | |
addNcDim outlatdim # addNcDim outlondim # | |
addNcDim outtimedim # addNcDim outpcdim # | |
addNcVar outlatvar # addNcVar outlonvar # | |
addNcVar outtimevar # addNcVar outpcvar # | |
addNcVar outepatvar # addNcVar outvarexpvar # addNcVar outprojvar | |
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $ | |
\outnc -> do | |
-- Write coordinate variable values. | |
put outnc outlatvar lat | |
put outnc outlonvar lon | |
putA outnc outtimevar [0] [ntime] time | |
put outnc outpcvar $ (SV.enumFromN 0 nspace :: SV.Vector CInt) | |
let outvarexp = | |
SV.fromList $ map realToFrac $ toList varexp :: SV.Vector CDouble | |
put outnc outvarexpvar outvarexp | |
forM_ [0..nspace-1] $ \is -> do | |
let pat = cmap realToFrac $ reshape nlon $ evecs ! is :: Matrix CDouble | |
pastart = [is, 0, 0] | |
pacount = [1, nlat, nlon] | |
putA outnc outepatvar pastart pacount $ HMatrix pat | |
forM_ [0..ntime-1] $ \it -> do | |
let proj = cmap realToFrac $ projs ! it :: Vector CDouble | |
prstart = [it, 0] | |
prcount = [1, nspace] | |
putA outnc outprojvar prstart prcount $ HVector proj | |
return () |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment