Profile results for forward function in Theano using scan
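The gist below compares two implementations of the same recurrent forward pass: a plain numpy loop (class Network) and a Theano version that runs the recurrence on the GPU with scan (class Network_Theano_Scan). The script times both implementations and checks that their outputs agree; the Theano profiler output for the scan version follows the code.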
from numpy import zeros, dot, exp, tanh, array, allclose
from numpy.random import randn
from copy import deepcopy
from time import time
from theano import tensor as T
from theano import function, shared, config, scan

FLOAT_PRECISION = config.floatX


class Network:
    def __init__(self, ni, ns, initial=0.1, maxlen=2500):
        na = 1 + ni + ns  # bias + inputs + recurrent state
        self.dims = ni, ns, na
        self.init_variables(initial, maxlen)

    def init_variables(self, initial, maxlen=2500):
        n = maxlen
        ni, ns, na = self.dims
        self.WGI = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.WGO = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.WCI = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.source = array(zeros([n, na]), dtype=FLOAT_PRECISION)
        self.cix = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.ci = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gix = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gi = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gox = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.go = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.state = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.output = array(zeros([n, ns]), dtype=FLOAT_PRECISION)

    def forward(self, xs):
        def ffunc(x):
            return 1.0 / (1.0 + exp(-x))
        ni, ns, na = self.dims
        prev = zeros(ns)
        n = len(xs)
        for t in range(n):
            self.source[t, 0] = 1               # bias
            self.source[t, 1:1 + ni] = xs[t]    # current input
            self.source[t, 1 + ni:] = prev      # previous output
            dot(self.WGI, self.source[t], out=self.gix[t])
            dot(self.WGO, self.source[t], out=self.gox[t])
            dot(self.WCI, self.source[t], out=self.cix[t])
            self.gi[t] = ffunc(self.gix[t])     # input gate
            self.ci[t] = tanh(self.cix[t])      # cell input
            self.state[t] = self.ci[t] * self.gi[t]
            self.go[t] = ffunc(self.gox[t])     # output gate
            self.output[t] = tanh(self.state[t]) * self.go[t]
            prev = self.output[t]
        return self.output[:n]
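# For reference, the per-step recurrence that both implementations compute can
# be restated as a standalone function. This is a sketch for illustration only
# (numpy_step is not part of the original gist); it mirrors the body of the
# loop in Network.forward above.
def numpy_step(x_t, prev, WGI, WGO, WCI):
    from numpy import concatenate  # local import; everything else is imported above
    source = concatenate(([1.0], x_t, prev))    # bias, current input, previous output
    gi = 1.0 / (1.0 + exp(-dot(WGI, source)))   # input gate
    ci = tanh(dot(WCI, source))                 # cell input
    go = 1.0 / (1.0 + exp(-dot(WGO, source)))   # output gate
    state = ci * gi
    return tanh(state) * go                     # output, fed back in as prev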
class Network_Theano_Scan:
    def __init__(self, original_net, ni, ns, maxlen=2500):
        na = 1 + ni + ns
        # Copy the weights so that both networks produce the same results.
        self.copy_weights(original_net)
        self.uploadweightsTheano()
        self.initforwardTheano(ns, maxlen, na)

    def copy_weights(self, original_net):
        self.WGI = deepcopy(original_net.WGI)
        self.WGO = deepcopy(original_net.WGO)
        self.WCI = deepcopy(original_net.WCI)

    def uploadweightsTheano(self):
        self.TWGI_shared = shared(self.WGI)
        self.TWGO_shared = shared(self.WGO)
        self.TWCI_shared = shared(self.WCI)

    def initforwardTheano(self, ns, n, na):
        def Tffunc(x):
            Tone = array([1.0], dtype=FLOAT_PRECISION)
            return Tone / (Tone + T.exp(-x))
        self.Toutput = shared(zeros([n, ns], dtype=FLOAT_PRECISION))
        self.Tgo_shared = shared(zeros([n, ns], dtype=FLOAT_PRECISION))  # allocated but unused here
        # Placeholder shape only; the real input is installed later via set_value.
        self.Txs_shared = shared(zeros([n, ns], dtype=FLOAT_PRECISION))
        Tone = array([1.0], dtype=FLOAT_PRECISION)
        Tn = T.iscalar('Tn')
        Ta = T.arange(Tn)

        def step(Tt, Tprev_output):
            Txs = self.Txs_shared[Tt]
            Tsource = T.concatenate([Tone, Txs, Tprev_output])
            Tgix = T.dot(self.TWGI_shared, Tsource)
            Tgox = T.dot(self.TWGO_shared, Tsource)
            Tcix = T.dot(self.TWCI_shared, Tsource)
            Tgi = Tffunc(Tgix)     # input gate
            Tci = T.tanh(Tcix)     # cell input
            Tstate = Tci * Tgi
            Tgo = Tffunc(Tgox)     # output gate
            output = T.tanh(Tstate) * Tgo
            return output

        Toutput0 = T.zeros([ns])   # initial "previous output"
        Toutput, _ = scan(step,
                          sequences=[Ta],
                          outputs_info=[Toutput0],
                          non_sequences=[])
        # Write the scan result into the preallocated shared buffer so the
        # output stays on the GPU; no host transfer happens inside Tforward.
        updates = [(self.Toutput, T.set_subtensor(self.Toutput[:Tn], Toutput))]
        self.Tforward = function([Tn], outputs=[], updates=updates)

    def forward(self, xs):
        n = len(xs)
        self.Tforward(n)
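# Aside (not in the original gist): Tforward writes the scan result into the
# preallocated shared buffer self.Toutput via an update, which keeps the
# result on the GPU during the timed region. A simpler variant would return
# the scan output directly from the compiled function, e.g.
#     self.Tforward = function([Tn], outputs=Toutput)
# but that forces a device-to-host transfer on every call; with the update
# form the output is fetched once afterwards with Toutput.get_value().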
# init
ninput = 48
nstates = 100
seqlength = 1000
network_orig = Network(ninput, nstates)
network_theano_scan = Network_Theano_Scan(network_orig, ninput, nstates)
data = array(randn(seqlength, ninput), dtype=FLOAT_PRECISION)

# numpy
starttime = time()
output = network_orig.forward(data)
print "numpy takes {}s".format(time() - starttime)

# theano_scan
network_theano_scan.Txs_shared.set_value(data)
starttime = time()
network_theano_scan.forward(data)
print "theano scan takes {}s".format(time() - starttime)
output_theano_scan = network_theano_scan.Toutput.get_value()[:seqlength]

# Check that both implementations agree; the tolerances are loose because
# the GPU version runs in float32.
rtol = 1e-04
atol = 1e-05
if not allclose(output, output_theano_scan, rtol, atol):
    import pdb; pdb.set_trace()
print "pass: theano scan code"
Run with the following flags: export THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32,profile=True
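Profiling is enabled globally here through THEANO_FLAGS. It can also be turned on for a single compiled function, which is handy when only one function matters; a minimal sketch, assuming the standard Theano profiling API of this era:

    f = function([Tn], outputs=[], updates=updates, profile=True)
    f(1000)
    f.profile.summary()  # prints the same Class/Ops/Apply breakdown shown below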
Function profiling
==================
Message: None
Time in 1 calls to Function.__call__: 3.572400e-01s
Time in Function.fn.__call__: 3.571680e-01s (99.980%)
Time in thunks: 3.571451e-01s (99.973%)
Total compile time: 7.711120e-01s
Theano Optimizer time: 5.242310e-01s
Theano validate time: 1.065993e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.425768e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
99.9% 99.9% 0.357s 3.57e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
0.1% 100.0% 0.000s 1.99e-04s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
0.0% 100.0% 0.000s 3.81e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.22e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 5.96e-06s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
0.0% 100.0% 0.000s 9.54e-07s C 5 5 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 4.05e-06s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
99.9% 99.9% 0.357s 3.57e-01s Py 1 1 forall_inplace,gpu,scan_fn}
0.1% 100.0% 0.000s 1.99e-04s C 1 1 GpuAlloc{memset_0=True}
0.0% 100.0% 0.000s 3.22e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 3.46e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 6.91e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.54e-07s C 5 5 ScalarFromTensor
0.0% 100.0% 0.000s 4.77e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 4.05e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 2.86e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
0.0% 100.0% 0.000s 2.15e-06s C 1 1 Elemwise{add,no_inplace}
0.0% 100.0% 0.000s 2.15e-06s C 1 1 Elemwise{le,no_inplace}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
99.9% 99.9% 0.357s 3.57e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
0.1% 100.0% 0.000s 1.99e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 1.09508501e+21]]}, Elemwise{Compo
0.0% 100.0% 0.000s 3.22e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
0.0% 100.0% 0.000s 1.50e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
0.0% 100.0% 0.000s 6.91e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
0.0% 100.0% 0.000s 5.96e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 5.96e-06s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
0.0% 100.0% 0.000s 5.01e-06s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
0.0% 100.0% 0.000s 5.01e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
0.0% 100.0% 0.000s 4.77e-06s 1 8 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)](Elemwise{le,no_inp
0.0% 100.0% 0.000s 4.05e-06s 1 20 GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0,
0.0% 100.0% 0.000s 3.10e-06s 1 10 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Ca
0.0% 100.0% 0.000s 3.10e-06s 1 1 Elemwise{Cast{int64}}(Tn)
0.0% 100.0% 0.000s 2.86e-06s 1 5 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}(TensorConstant{1}, Elemwise{add
0.0% 100.0% 0.000s 2.15e-06s 1 3 Elemwise{add,no_inplace}(TensorConstant{1}, Elemwise{Cast{int64}}.0)
0.0% 100.0% 0.000s 2.15e-06s 1 6 Elemwise{le,no_inplace}(Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i
0.0% 100.0% 0.000s 9.54e-07s 1 14 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 9.54e-07s 1 16 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
0.0% 100.0% 0.000s 9.54e-07s 1 18 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
0.0% 100.0% 0.000s 9.54e-07s 1 13 ScalarFromTensor(Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}.0)
... (remaining 2 Apply instances account for 0.00%(0.00s) of the runtime)
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 1 calls of the op (for a total of 1000 steps) 3.567450e-01s
Total time spent in calling the VM 2.880974e-01s (80.757%)
Total overhead (computing slices..) 6.864762e-02s (19.243%)
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
36.8% 36.8% 0.092s 3.06e-05s C 3000 3 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
29.1% 65.9% 0.072s 7.24e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
24.4% 90.3% 0.061s 2.02e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
7.6% 97.8% 0.019s 1.88e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
1.2% 99.0% 0.003s 9.77e-07s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.7% 99.7% 0.002s 1.81e-06s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.3% 100.0% 0.001s 7.09e-07s C 1000 1 <class 'theano.tensor.basic.ScalarFromTensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
36.8% 36.8% 0.092s 3.06e-05s C 3000 3 GpuAlloc{memset_0=True}
29.1% 65.9% 0.072s 7.24e-05s C 1000 1 GpuJoin
24.4% 90.3% 0.061s 2.02e-05s C 3000 3 GpuGemv{inplace}
7.6% 97.8% 0.019s 1.88e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
1.2% 99.0% 0.003s 9.77e-07s C 3000 3 Shape_i{0}
0.7% 99.7% 0.002s 1.81e-06s C 1000 1 GpuSubtensor{int32}
0.3% 100.0% 0.001s 7.09e-07s C 1000 1 ScalarFromTensor
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
29.1% 29.1% 0.072s 7.24e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
12.5% 41.5% 0.031s 3.10e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
12.4% 53.9% 0.031s 3.09e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
12.0% 65.9% 0.030s 2.98e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
8.3% 74.2% 0.021s 2.08e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
8.1% 82.4% 0.020s 2.03e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
7.9% 90.3% 0.020s 1.97e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
7.6% 97.8% 0.019s 1.88e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
0.7% 98.5% 0.002s 1.81e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
0.4% 99.0% 0.001s 1.10e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.4% 99.4% 0.001s 9.84e-07s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.3% 99.7% 0.001s 8.51e-07s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.3% 100.0% 0.001s 7.09e-07s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
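Reading the scan-op profile above: of the ~0.357s that the 1000-step scan takes, about 19% is scan bookkeeping ("computing slices"), and inside the step itself GpuAlloc (~37%), GpuJoin (~29%) and the three GpuGemv matrix-vector products (~24%) dominate. At this size (100 states, a 149-element source vector per step), per-step allocation and concatenation overhead on the GPU costs more than the actual linear algebra.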
Function profiling
==================
Message: Sum of all printed profiles at exit
Time in 1 calls to Function.__call__: 3.572400e-01s
Time in Function.fn.__call__: 6.452653e-01s (180.625%)
Time in thunks: 6.063292e-01s (169.726%)
Total compile time: 9.907391e-01s
Theano Optimizer time: 7.288530e-01s
Theano validate time: 2.013111e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.546458e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
58.9% 58.9% 0.357s 3.57e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
15.2% 74.0% 0.092s 3.06e-05s C 3001 4 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
11.9% 86.0% 0.072s 7.24e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
10.0% 96.0% 0.061s 2.02e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
3.1% 99.1% 0.019s 1.88e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
0.5% 99.6% 0.003s 9.77e-07s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.3% 99.9% 0.002s 1.81e-06s C 1001 2 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.1% 100.0% 0.001s 7.10e-07s C 1005 6 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 3.81e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.22e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 5.96e-06s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
58.9% 58.9% 0.357s 3.57e-01s Py 1 1 forall_inplace,gpu,scan_fn}
15.2% 74.0% 0.092s 3.06e-05s C 3001 4 GpuAlloc{memset_0=True}
11.9% 86.0% 0.072s 7.24e-05s C 1000 1 GpuJoin
10.0% 96.0% 0.061s 2.02e-05s C 3000 3 GpuGemv{inplace}
3.1% 99.1% 0.019s 1.88e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
0.5% 99.6% 0.003s 9.77e-07s C 3000 3 Shape_i{0}
0.3% 99.9% 0.002s 1.81e-06s C 1000 1 GpuSubtensor{int32}
0.1% 100.0% 0.001s 7.10e-07s C 1005 6 ScalarFromTensor
0.0% 100.0% 0.000s 3.22e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 3.46e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 6.91e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 4.77e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 4.05e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 2.86e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
... (remaining 2 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
58.9% 58.9% 0.357s 3.57e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
11.9% 70.8% 0.072s 7.24e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
5.1% 75.9% 0.031s 3.10e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
5.1% 81.0% 0.031s 3.09e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
4.9% 85.9% 0.030s 2.98e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
3.4% 89.4% 0.021s 2.08e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.3% 92.7% 0.020s 2.03e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.2% 95.9% 0.020s 1.97e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.1% 99.1% 0.019s 1.88e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
0.3% 99.3% 0.002s 1.81e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
0.2% 99.5% 0.001s 1.10e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.2% 99.7% 0.001s 9.84e-07s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.1% 99.8% 0.001s 8.51e-07s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.1% 100.0% 0.001s 7.09e-07s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
0.0% 100.0% 0.000s 1.99e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 1.09508501e+21]]}, Elemwise{Compo
0.0% 100.0% 0.000s 3.22e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
0.0% 100.0% 0.000s 1.50e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
0.0% 100.0% 0.000s 6.91e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
0.0% 100.0% 0.000s 5.96e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 5.96e-06s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
... (remaining 15 Apply instances account for 0.01%(0.00s) of the runtime)
Run with the following flags: export THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32,profile_memory=True,profile=True
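This second run adds profile_memory=True, which records per-apply shapes and strides and appends the Memory Profile sections below. Note that the extra bookkeeping slows the scan itself down noticeably: 8.46e-01s here versus 3.57e-01s in the run above.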
Function profiling
==================
Message: None
Time in 1 calls to Function.__call__: 8.473389e-01s
Time in Function.fn.__call__: 8.472540e-01s (99.990%)
Time in thunks: 8.466234e-01s (99.916%)
Total compile time: 7.679639e-01s
Theano Optimizer time: 5.219090e-01s
Theano validate time: 1.056147e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.417691e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
100.0% 100.0% 0.846s 8.46e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
0.0% 100.0% 0.000s 2.18e-04s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
0.0% 100.0% 0.000s 6.44e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.81e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 3.62e-06s C 5 5 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 1.00e-05s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
0.0% 100.0% 0.000s 8.11e-06s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
100.0% 100.0% 0.846s 8.46e-01s Py 1 1 forall_inplace,gpu,scan_fn}
0.0% 100.0% 0.000s 2.18e-04s C 1 1 GpuAlloc{memset_0=True}
0.0% 100.0% 0.000s 3.81e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 3.62e-06s C 5 5 ScalarFromTensor
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 5.36e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{le,no_inplace}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 4.05e-06s C 1 1 Elemwise{add,no_inplace}
0.0% 100.0% 0.000s 3.81e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
100.0% 100.0% 0.846s 8.46e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int32, shape=(1000,), strides=c
input 2: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 3: dtype=float32, shape=(100, 149), strides=c
input 4: dtype=float32, shape=(1000, 48), strides=c
input 5: dtype=float32, shape=(100, 149), strides=c
input 6: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 2.18e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composi
input 0: dtype=float32, shape=(1, 1), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 3.81e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
input 0: dtype=float32, shape=(2500, 100), strides=c
input 1: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 2: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(2500, 100), strides=(100, 1)
0.0% 100.0% 0.000s 1.60e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int32, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
input 0: dtype=int32, shape=(1000,), strides=c
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 8.11e-06s 1 8 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)](Elemwise{le,no_inp
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 8.11e-06s 1 20 GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0,
input 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 7.87e-06s 1 14 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 6.91e-06s 1 0 ScalarFromTensor(Tn)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
0.0% 100.0% 0.000s 5.96e-06s 1 10 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Ca
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
input 4: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 5.01e-06s 1 6 Elemwise{le,no_inplace}(Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int8, shape=(), strides=c
0.0% 100.0% 0.000s 5.01e-06s 1 1 Elemwise{Cast{int64}}(Tn)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 4.05e-06s 1 3 Elemwise{add,no_inplace}(TensorConstant{1}, Elemwise{Cast{int64}}.0)
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 3.81e-06s 1 5 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}(TensorConstant{1}, Elemwise{add
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 3.10e-06s 1 18 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
input 0: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=8, strides=c
0.0% 100.0% 0.000s 3.10e-06s 1 12 ScalarFromTensor(Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)].0
input 0: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=8, strides=c
0.0% 100.0% 0.000s 2.86e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 2 Apply instances account for 0.00%(0.00s) of the runtime)
Memory Profile
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 395KB
Max if linker=c|py: 395KB
Memory saved if gc is enabled (linker=c|py): 0KB
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
1000000B [(2500, 100)] i GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubtensor{int64:int64:int8}.0, ScalarFromTensor.0)
400000B [(1000, 100)] c GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Cast{int64}(i0), int_div(i1, i2))]}(NEQ(i0, i1), i2, i3)]}(mod(i0, i1), i2, i0, i1)]}(Composite{[Composite{[Composite{[sub(Switch(i0, i1, i2), i3)]}(LT(i0, i1), Switch(i2, i1, i3), Switch(i4, i0, i5), i1)]}(i0, i1, LT(i2, i1), i2, LT(i0, i3), i3)]}(i0, i1, i2, i3), i4, i1)]}(i0, i1, add(i0, i2), i2, i3)]}(maximum(i0, i1), i2, i3, i4)]}}.0, TensorConstant{100})
400000B [(1000, 100)] i forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>)
400000B [(1000, 100)] v GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
4000B [(1000,)] c ARange(TensorConstant{0}, Tn, TensorConstant{1})
4000B [(1000,)] v Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
... (remaining 16 Apply account for 117B/2208117B ((0.01%)) of the Apply with dense outputs sizes)
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 1 calls of the op (for a total of 1000 steps) 8.461490e-01s
Total time spent in calling the VM 7.661736e-01s (90.548%)
Total overhead (computing slices..) 7.997537e-02s (9.452%)
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
37.2% 37.2% 0.131s 4.37e-05s C 3000 3 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
25.3% 62.6% 0.089s 2.97e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
24.8% 87.3% 0.087s 8.73e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
7.4% 94.8% 0.026s 2.62e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
3.1% 97.9% 0.011s 3.61e-06s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
1.1% 99.0% 0.004s 3.88e-06s C 1000 1 <class 'theano.tensor.basic.ScalarFromTensor'>
1.0% 100.0% 0.004s 3.65e-06s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
37.2% 37.2% 0.131s 4.37e-05s C 3000 3 GpuAlloc{memset_0=True}
25.3% 62.6% 0.089s 2.97e-05s C 3000 3 GpuGemv{inplace}
24.8% 87.3% 0.087s 8.73e-05s C 1000 1 GpuJoin
7.4% 94.8% 0.026s 2.62e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
3.1% 97.9% 0.011s 3.61e-06s C 3000 3 Shape_i{0}
1.1% 99.0% 0.004s 3.88e-06s C 1000 1 ScalarFromTensor
1.0% 100.0% 0.004s 3.65e-06s C 1000 1 GpuSubtensor{int32}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
24.8% 24.8% 0.087s 8.73e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=float32, shape=(1,), strides=c
input 2: dtype=float32, shape=(48,), strides=(1,)
input 3: dtype=float32, shape=(100,), strides=c
output 0: dtype=float32, shape=(149,), strides=(1,)
12.5% 37.3% 0.044s 4.41e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
12.5% 49.8% 0.044s 4.39e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
12.2% 62.0% 0.043s 4.31e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.5% 70.5% 0.030s 2.99e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.4% 79.0% 0.030s 2.97e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.4% 87.3% 0.030s 2.96e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
7.4% 94.8% 0.026s 2.62e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(100,), strides=(1,)
input 2: dtype=float32, shape=(100,), strides=(1,)
output 0: dtype=float32, shape=(100,), strides=(1,)
1.1% 95.9% 0.004s 3.93e-06s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
1.1% 97.0% 0.004s 3.88e-06s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
1.0% 98.0% 0.004s 3.65e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
input 0: dtype=float32, shape=(1000, 48), strides=c
input 1: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(48,), strides=(1,)
1.0% 99.0% 0.003s 3.48e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
1.0% 100.0% 0.003s 3.40e-06s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
Memory Profile
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 2KB
Max if linker=c|py: 2KB
Memory saved if gc is enabled (linker=c|py): 0KB
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
... (remaining 13 Apply account for 3616B/3616B ((100.00%)) of the Apply with dense outputs sizes)
All Apply nodes have output sizes that take less than 1024B.
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
Function profiling
==================
Message: Sum of all printed profiles at exit
Time in 1 calls to Function.__call__: 8.473389e-01s
Time in Function.fn.__call__: 1.613428e+00s (190.411%)
Time in thunks: 1.198847e+00s (141.484%)
Total compile time: 9.870379e-01s
Theano Optimizer time: 7.257051e-01s
Theano validate time: 1.985645e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.541780e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
70.6% 70.6% 0.846s 8.46e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
11.0% 81.5% 0.131s 4.38e-05s C 3001 4 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
7.4% 89.0% 0.089s 2.97e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
7.3% 96.3% 0.087s 8.73e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
2.2% 98.5% 0.026s 2.62e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
0.9% 99.4% 0.011s 3.61e-06s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.3% 99.7% 0.004s 3.88e-06s C 1005 6 <class 'theano.tensor.basic.ScalarFromTensor'>
0.3% 100.0% 0.004s 3.65e-06s C 1001 2 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.0% 100.0% 0.000s 6.44e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.81e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 1.00e-05s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
70.6% 70.6% 0.846s 8.46e-01s Py 1 1 forall_inplace,gpu,scan_fn}
11.0% 81.5% 0.131s 4.38e-05s C 3001 4 GpuAlloc{memset_0=True}
7.4% 89.0% 0.089s 2.97e-05s C 3000 3 GpuGemv{inplace}
7.3% 96.3% 0.087s 8.73e-05s C 1000 1 GpuJoin
2.2% 98.5% 0.026s 2.62e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
0.9% 99.4% 0.011s 3.61e-06s C 3000 3 Shape_i{0}
0.3% 99.7% 0.004s 3.88e-06s C 1005 6 ScalarFromTensor
0.3% 100.0% 0.004s 3.65e-06s C 1000 1 GpuSubtensor{int32}
0.0% 100.0% 0.000s 3.81e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 5.36e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{le,no_inplace}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Cast{int64}}
... (remaining 2 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
70.6% 70.6% 0.846s 8.46e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int32, shape=(1000,), strides=c
input 2: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 3: dtype=float32, shape=(100, 149), strides=c
input 4: dtype=float32, shape=(1000, 48), strides=c
input 5: dtype=float32, shape=(100, 149), strides=c
input 6: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
7.3% 77.9% 0.087s 8.73e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=float32, shape=(1,), strides=c
input 2: dtype=float32, shape=(48,), strides=(1,)
input 3: dtype=float32, shape=(100,), strides=c
output 0: dtype=float32, shape=(149,), strides=(1,)
3.7% 81.5% 0.044s 4.41e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
3.7% 85.2% 0.044s 4.39e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
3.6% 88.8% 0.043s 4.31e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 91.3% 0.030s 2.99e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 93.8% 0.030s 2.97e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 96.3% 0.030s 2.96e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.2% 98.4% 0.026s 2.62e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(100,), strides=(1,)
input 2: dtype=float32, shape=(100,), strides=(1,)
output 0: dtype=float32, shape=(100,), strides=(1,)
0.3% 98.8% 0.004s 3.93e-06s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.3% 99.1% 0.004s 3.88e-06s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
0.3% 99.4% 0.004s 3.65e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
input 0: dtype=float32, shape=(1000, 48), strides=c
input 1: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(48,), strides=(1,)
0.3% 99.7% 0.003s 3.48e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.3% 100.0% 0.003s 3.40e-06s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 2.18e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composi
input 0: dtype=float32, shape=(1, 1), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 3.81e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
input 0: dtype=float32, shape=(2500, 100), strides=c
input 1: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 2: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(2500, 100), strides=(100, 1)
0.0% 100.0% 0.000s 1.60e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int32, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
input 0: dtype=int32, shape=(1000,), strides=c
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 15 Apply instances account for 0.01%(0.00s) of the runtime)
Memory Profile (the max between all functions in that profile)
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 395KB
Max if linker=c|py: 395KB
Memory saved if gc is enabled (linker=c|py): 0KB
This list is based on all functions in the profile
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
1000000B [(2500, 100)] i GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubtensor{int64:int64:int8}.0, ScalarFromTensor.0)
400000B [(1000, 100)] c GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Cast{int64}(i0), int_div(i1, i2))]}(NEQ(i0, i1), i2, i3)]}(mod(i0, i1), i2, i0, i1)]}(Composite{[Composite{[Composite{[sub(Switch(i0, i1, i2), i3)]}(LT(i0, i1), Switch(i2, i1, i3), Switch(i4, i0, i5), i1)]}(i0, i1, LT(i2, i1), i2, LT(i0, i3), i3)]}(i0, i1, i2, i3), i4, i1)]}(i0, i1, add(i0, i2), i2, i3)]}(maximum(i0, i1), i2, i3, i4)]}}.0, TensorConstant{100})
400000B [(1000, 100)] i forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>)
400000B [(1000, 100)] v GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
4000B [(1000,)] c ARange(TensorConstant{0}, Tn, TensorConstant{1})
4000B [(1000,)] v Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
... (remaining 29 Apply account for 3733B/2211733B ((0.17%)) of the Apply with dense outputs sizes)
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
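Takeaway from the memory profile: peak usage is tiny (~395KB with gc disabled), essentially the 1000x100 float32 scan output buffer (400000B); the large 2500x100 buffer is written in place ('i') and the scan output is sliced as a view ('v'), so neither adds to the peak. The cost of the scan version is therefore dominated by per-step op overhead, not memory pressure.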