Profile results for forward function in Theano using scan
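The gist below compares two implementations of the same recurrent forward pass: a plain numpy loop (class Network) and a Theano version that runs the recurrence on the GPU with scan (class Network_Theano_Scan). The script times both implementations and checks that their outputs agree; the Theano profiler output for the scan version follows the code.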
from numpy import zeros, dot, exp, tanh, array, allclose
from numpy.random import randn
from copy import deepcopy
from time import time
from theano import tensor as T
from theano import function, shared, config, scan

FLOAT_PRECISION = config.floatX


class Network:
    def __init__(self, ni, ns, initial=0.1, maxlen=2500):
        na = 1 + ni + ns  # bias + inputs + recurrent state
        self.dims = ni, ns, na
        self.init_variables(initial, maxlen)

    def init_variables(self, initial, maxlen=2500):
        n = maxlen
        ni, ns, na = self.dims
        self.WGI = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.WGO = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.WCI = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.source = array(zeros([n, na]), dtype=FLOAT_PRECISION)
        self.cix = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.ci = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gix = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gi = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gox = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.go = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.state = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.output = array(zeros([n, ns]), dtype=FLOAT_PRECISION)

    def forward(self, xs):
        def ffunc(x):
            return 1.0 / (1.0 + exp(-x))
        ni, ns, na = self.dims
        prev = zeros(ns)
        n = len(xs)
        for t in range(n):
            self.source[t, 0] = 1               # bias
            self.source[t, 1:1 + ni] = xs[t]    # current input
            self.source[t, 1 + ni:] = prev      # previous output
            dot(self.WGI, self.source[t], out=self.gix[t])
            dot(self.WGO, self.source[t], out=self.gox[t])
            dot(self.WCI, self.source[t], out=self.cix[t])
            self.gi[t] = ffunc(self.gix[t])     # input gate
            self.ci[t] = tanh(self.cix[t])      # cell input
            self.state[t] = self.ci[t] * self.gi[t]
            self.go[t] = ffunc(self.gox[t])     # output gate
            self.output[t] = tanh(self.state[t]) * self.go[t]
            prev = self.output[t]
        return self.output[:n]
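# For reference, the per-step recurrence that both implementations compute can
# be restated as a standalone function. This is a sketch for illustration only
# (numpy_step is not part of the original gist); it mirrors the body of the
# loop in Network.forward above.
def numpy_step(x_t, prev, WGI, WGO, WCI):
    from numpy import concatenate  # local import; everything else is imported above
    source = concatenate(([1.0], x_t, prev))    # bias, current input, previous output
    gi = 1.0 / (1.0 + exp(-dot(WGI, source)))   # input gate
    ci = tanh(dot(WCI, source))                 # cell input
    go = 1.0 / (1.0 + exp(-dot(WGO, source)))   # output gate
    state = ci * gi
    return tanh(state) * go                     # output, fed back in as prev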
class Network_Theano_Scan:
    def __init__(self, original_net, ni, ns, maxlen=2500):
        na = 1 + ni + ns
        # Copy the weights so that both networks produce the same results.
        self.copy_weights(original_net)
        self.uploadweightsTheano()
        self.initforwardTheano(ns, maxlen, na)

    def copy_weights(self, original_net):
        self.WGI = deepcopy(original_net.WGI)
        self.WGO = deepcopy(original_net.WGO)
        self.WCI = deepcopy(original_net.WCI)

    def uploadweightsTheano(self):
        self.TWGI_shared = shared(self.WGI)
        self.TWGO_shared = shared(self.WGO)
        self.TWCI_shared = shared(self.WCI)

    def initforwardTheano(self, ns, n, na):
        def Tffunc(x):
            Tone = array([1.0], dtype=FLOAT_PRECISION)
            return Tone / (Tone + T.exp(-x))
        self.Toutput = shared(zeros([n, ns], dtype=FLOAT_PRECISION))
        self.Tgo_shared = shared(zeros([n, ns], dtype=FLOAT_PRECISION))  # allocated but unused here
        # Placeholder shape only; the real input is installed later via set_value.
        self.Txs_shared = shared(zeros([n, ns], dtype=FLOAT_PRECISION))
        Tone = array([1.0], dtype=FLOAT_PRECISION)
        Tn = T.iscalar('Tn')
        Ta = T.arange(Tn)

        def step(Tt, Tprev_output):
            Txs = self.Txs_shared[Tt]
            Tsource = T.concatenate([Tone, Txs, Tprev_output])
            Tgix = T.dot(self.TWGI_shared, Tsource)
            Tgox = T.dot(self.TWGO_shared, Tsource)
            Tcix = T.dot(self.TWCI_shared, Tsource)
            Tgi = Tffunc(Tgix)     # input gate
            Tci = T.tanh(Tcix)     # cell input
            Tstate = Tci * Tgi
            Tgo = Tffunc(Tgox)     # output gate
            output = T.tanh(Tstate) * Tgo
            return output

        Toutput0 = T.zeros([ns])   # initial "previous output"
        Toutput, _ = scan(step,
                          sequences=[Ta],
                          outputs_info=[Toutput0],
                          non_sequences=[])
        # Write the scan result into the preallocated shared buffer so the
        # output stays on the GPU; no host transfer happens inside Tforward.
        updates = [(self.Toutput, T.set_subtensor(self.Toutput[:Tn], Toutput))]
        self.Tforward = function([Tn], outputs=[], updates=updates)

    def forward(self, xs):
        n = len(xs)
        self.Tforward(n)
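# Aside (not in the original gist): Tforward writes the scan result into the
# preallocated shared buffer self.Toutput via an update, which keeps the
# result on the GPU during the timed region. A simpler variant would return
# the scan output directly from the compiled function, e.g.
#     self.Tforward = function([Tn], outputs=Toutput)
# but that forces a device-to-host transfer on every call; with the update
# form the output is fetched once afterwards with Toutput.get_value().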
# init
ninput = 48
nstates = 100
seqlength = 1000
network_orig = Network(ninput, nstates)
network_theano_scan = Network_Theano_Scan(network_orig, ninput, nstates)
data = array(randn(seqlength, ninput), dtype=FLOAT_PRECISION)

# numpy
starttime = time()
output = network_orig.forward(data)
print "numpy takes {}s".format(time() - starttime)

# theano_scan
network_theano_scan.Txs_shared.set_value(data)
starttime = time()
network_theano_scan.forward(data)
print "theano scan takes {}s".format(time() - starttime)
output_theano_scan = network_theano_scan.Toutput.get_value()[:seqlength]

# Check that both implementations agree; the tolerances are loose because
# the GPU version runs in float32.
rtol = 1e-04
atol = 1e-05
if not allclose(output, output_theano_scan, rtol, atol):
    import pdb; pdb.set_trace()
print "pass: theano scan code"
Run with the following flags: export THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32,profile=True
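Profiling is enabled globally here through THEANO_FLAGS. It can also be turned on for a single compiled function, which is handy when only one function matters; a minimal sketch, assuming the standard Theano profiling API of this era:

    f = function([Tn], outputs=[], updates=updates, profile=True)
    f(1000)
    f.profile.summary()  # prints the same Class/Ops/Apply breakdown shown below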
Function profiling
==================
Message: None
Time in 1 calls to Function.__call__: 3.572400e-01s
Time in Function.fn.__call__: 3.571680e-01s (99.980%)
Time in thunks: 3.571451e-01s (99.973%)
Total compile time: 7.711120e-01s
Theano Optimizer time: 5.242310e-01s
Theano validate time: 1.065993e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.425768e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
99.9% 99.9% 0.357s 3.57e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
0.1% 100.0% 0.000s 1.99e-04s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
0.0% 100.0% 0.000s 3.81e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.22e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 5.96e-06s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
0.0% 100.0% 0.000s 9.54e-07s C 5 5 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 4.05e-06s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
99.9% 99.9% 0.357s 3.57e-01s Py 1 1 forall_inplace,gpu,scan_fn}
0.1% 100.0% 0.000s 1.99e-04s C 1 1 GpuAlloc{memset_0=True}
0.0% 100.0% 0.000s 3.22e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 3.46e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 6.91e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.54e-07s C 5 5 ScalarFromTensor
0.0% 100.0% 0.000s 4.77e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 4.05e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 2.86e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
0.0% 100.0% 0.000s 2.15e-06s C 1 1 Elemwise{add,no_inplace}
0.0% 100.0% 0.000s 2.15e-06s C 1 1 Elemwise{le,no_inplace}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
99.9% 99.9% 0.357s 3.57e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
0.1% 100.0% 0.000s 1.99e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 1.09508501e+21]]}, Elemwise{Compo
0.0% 100.0% 0.000s 3.22e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
0.0% 100.0% 0.000s 1.50e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
0.0% 100.0% 0.000s 6.91e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
0.0% 100.0% 0.000s 5.96e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 5.96e-06s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
0.0% 100.0% 0.000s 5.01e-06s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
0.0% 100.0% 0.000s 5.01e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
0.0% 100.0% 0.000s 4.77e-06s 1 8 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)](Elemwise{le,no_inp
0.0% 100.0% 0.000s 4.05e-06s 1 20 GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0,
0.0% 100.0% 0.000s 3.10e-06s 1 10 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Ca
0.0% 100.0% 0.000s 3.10e-06s 1 1 Elemwise{Cast{int64}}(Tn)
0.0% 100.0% 0.000s 2.86e-06s 1 5 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}(TensorConstant{1}, Elemwise{add
0.0% 100.0% 0.000s 2.15e-06s 1 3 Elemwise{add,no_inplace}(TensorConstant{1}, Elemwise{Cast{int64}}.0)
0.0% 100.0% 0.000s 2.15e-06s 1 6 Elemwise{le,no_inplace}(Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i
0.0% 100.0% 0.000s 9.54e-07s 1 14 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 9.54e-07s 1 16 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
0.0% 100.0% 0.000s 9.54e-07s 1 18 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
0.0% 100.0% 0.000s 9.54e-07s 1 13 ScalarFromTensor(Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}.0)
... (remaining 2 Apply instances account for 0.00%(0.00s) of the runtime)
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 1 calls of the op (for a total of 1000 steps) 3.567450e-01s
Total time spent in calling the VM 2.880974e-01s (80.757%)
Total overhead (computing slices..) 6.864762e-02s (19.243%)
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
36.8% 36.8% 0.092s 3.06e-05s C 3000 3 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
29.1% 65.9% 0.072s 7.24e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
24.4% 90.3% 0.061s 2.02e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
7.6% 97.8% 0.019s 1.88e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
1.2% 99.0% 0.003s 9.77e-07s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.7% 99.7% 0.002s 1.81e-06s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.3% 100.0% 0.001s 7.09e-07s C 1000 1 <class 'theano.tensor.basic.ScalarFromTensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
36.8% 36.8% 0.092s 3.06e-05s C 3000 3 GpuAlloc{memset_0=True}
29.1% 65.9% 0.072s 7.24e-05s C 1000 1 GpuJoin
24.4% 90.3% 0.061s 2.02e-05s C 3000 3 GpuGemv{inplace}
7.6% 97.8% 0.019s 1.88e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
1.2% 99.0% 0.003s 9.77e-07s C 3000 3 Shape_i{0}
0.7% 99.7% 0.002s 1.81e-06s C 1000 1 GpuSubtensor{int32}
0.3% 100.0% 0.001s 7.09e-07s C 1000 1 ScalarFromTensor
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
29.1% 29.1% 0.072s 7.24e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
12.5% 41.5% 0.031s 3.10e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
12.4% 53.9% 0.031s 3.09e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
12.0% 65.9% 0.030s 2.98e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
8.3% 74.2% 0.021s 2.08e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
8.1% 82.4% 0.020s 2.03e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
7.9% 90.3% 0.020s 1.97e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
7.6% 97.8% 0.019s 1.88e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
0.7% 98.5% 0.002s 1.81e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
0.4% 99.0% 0.001s 1.10e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.4% 99.4% 0.001s 9.84e-07s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.3% 99.7% 0.001s 8.51e-07s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.3% 100.0% 0.001s 7.09e-07s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
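Reading the scan-op profile above: of the ~0.357s that the 1000-step scan takes, about 19% is scan bookkeeping ("computing slices"), and inside the step itself GpuAlloc (~37%), GpuJoin (~29%) and the three GpuGemv matrix-vector products (~24%) dominate. At this size (100 states, a 149-element source vector per step), per-step allocation and concatenation overhead on the GPU costs more than the actual linear algebra.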
Function profiling
==================
Message: Sum of all printed profiles at exit
Time in 1 calls to Function.__call__: 3.572400e-01s
Time in Function.fn.__call__: 6.452653e-01s (180.625%)
Time in thunks: 6.063292e-01s (169.726%)
Total compile time: 9.907391e-01s
Theano Optimizer time: 7.288530e-01s
Theano validate time: 2.013111e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.546458e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
58.9% 58.9% 0.357s 3.57e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
15.2% 74.0% 0.092s 3.06e-05s C 3001 4 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
11.9% 86.0% 0.072s 7.24e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
10.0% 96.0% 0.061s 2.02e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
3.1% 99.1% 0.019s 1.88e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
0.5% 99.6% 0.003s 9.77e-07s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.3% 99.9% 0.002s 1.81e-06s C 1001 2 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.1% 100.0% 0.001s 7.10e-07s C 1005 6 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 3.81e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.22e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 5.96e-06s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
58.9% 58.9% 0.357s 3.57e-01s Py 1 1 forall_inplace,gpu,scan_fn}
15.2% 74.0% 0.092s 3.06e-05s C 3001 4 GpuAlloc{memset_0=True}
11.9% 86.0% 0.072s 7.24e-05s C 1000 1 GpuJoin
10.0% 96.0% 0.061s 2.02e-05s C 3000 3 GpuGemv{inplace}
3.1% 99.1% 0.019s 1.88e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
0.5% 99.6% 0.003s 9.77e-07s C 3000 3 Shape_i{0}
0.3% 99.9% 0.002s 1.81e-06s C 1000 1 GpuSubtensor{int32}
0.1% 100.0% 0.001s 7.10e-07s C 1005 6 ScalarFromTensor
0.0% 100.0% 0.000s 3.22e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 3.46e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 6.91e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 4.77e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 4.05e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 2.86e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
... (remaining 2 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
58.9% 58.9% 0.357s 3.57e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
11.9% 70.8% 0.072s 7.24e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
5.1% 75.9% 0.031s 3.10e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
5.1% 81.0% 0.031s 3.09e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
4.9% 85.9% 0.030s 2.98e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
3.4% 89.4% 0.021s 2.08e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.3% 92.7% 0.020s 2.03e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.2% 95.9% 0.020s 1.97e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.1% 99.1% 0.019s 1.88e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
0.3% 99.3% 0.002s 1.81e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
0.2% 99.5% 0.001s 1.10e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.2% 99.7% 0.001s 9.84e-07s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.1% 99.8% 0.001s 8.51e-07s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.1% 100.0% 0.001s 7.09e-07s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
0.0% 100.0% 0.000s 1.99e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 1.09508501e+21]]}, Elemwise{Compo
0.0% 100.0% 0.000s 3.22e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
0.0% 100.0% 0.000s 1.50e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
0.0% 100.0% 0.000s 6.91e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
0.0% 100.0% 0.000s 5.96e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 5.96e-06s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
... (remaining 15 Apply instances account for 0.01%(0.00s) of the runtime)
Run with the following flags: export THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32,profile_memory=True,profile=True
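This second run adds profile_memory=True, which records per-apply shapes and strides and appends the Memory Profile sections below. Note that the extra bookkeeping slows the scan itself down noticeably: 8.46e-01s here versus 3.57e-01s in the run above.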
Function profiling
==================
Message: None
Time in 1 calls to Function.__call__: 8.473389e-01s
Time in Function.fn.__call__: 8.472540e-01s (99.990%)
Time in thunks: 8.466234e-01s (99.916%)
Total compile time: 7.679639e-01s
Theano Optimizer time: 5.219090e-01s
Theano validate time: 1.056147e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.417691e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
100.0% 100.0% 0.846s 8.46e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
0.0% 100.0% 0.000s 2.18e-04s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
0.0% 100.0% 0.000s 6.44e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.81e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 3.62e-06s C 5 5 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 1.00e-05s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
0.0% 100.0% 0.000s 8.11e-06s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
100.0% 100.0% 0.846s 8.46e-01s Py 1 1 forall_inplace,gpu,scan_fn}
0.0% 100.0% 0.000s 2.18e-04s C 1 1 GpuAlloc{memset_0=True}
0.0% 100.0% 0.000s 3.81e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 3.62e-06s C 5 5 ScalarFromTensor
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 5.36e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{le,no_inplace}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 4.05e-06s C 1 1 Elemwise{add,no_inplace}
0.0% 100.0% 0.000s 3.81e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
100.0% 100.0% 0.846s 8.46e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int32, shape=(1000,), strides=c
input 2: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 3: dtype=float32, shape=(100, 149), strides=c
input 4: dtype=float32, shape=(1000, 48), strides=c
input 5: dtype=float32, shape=(100, 149), strides=c
input 6: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 2.18e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composi
input 0: dtype=float32, shape=(1, 1), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 3.81e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
input 0: dtype=float32, shape=(2500, 100), strides=c
input 1: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 2: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(2500, 100), strides=(100, 1)
0.0% 100.0% 0.000s 1.60e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int32, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
input 0: dtype=int32, shape=(1000,), strides=c
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 8.11e-06s 1 8 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)](Elemwise{le,no_inp
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 8.11e-06s 1 20 GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0,
input 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 7.87e-06s 1 14 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 6.91e-06s 1 0 ScalarFromTensor(Tn)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
0.0% 100.0% 0.000s 5.96e-06s 1 10 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Ca
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
input 4: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 5.01e-06s 1 6 Elemwise{le,no_inplace}(Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int8, shape=(), strides=c
0.0% 100.0% 0.000s 5.01e-06s 1 1 Elemwise{Cast{int64}}(Tn)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 4.05e-06s 1 3 Elemwise{add,no_inplace}(TensorConstant{1}, Elemwise{Cast{int64}}.0)
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 3.81e-06s 1 5 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}(TensorConstant{1}, Elemwise{add
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 3.10e-06s 1 18 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
input 0: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=8, strides=c
0.0% 100.0% 0.000s 3.10e-06s 1 12 ScalarFromTensor(Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)].0
input 0: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=8, strides=c
0.0% 100.0% 0.000s 2.86e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 2 Apply instances account for 0.00%(0.00s) of the runtime)
Memory Profile
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 395KB
Max if linker=c|py: 395KB
Memory saved if gc is enabled (linker=c|py): 0KB
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
1000000B [(2500, 100)] i GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubtensor{int64:int64:int8}.0, ScalarFromTensor.0)
400000B [(1000, 100)] c GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Cast{int64}(i0), int_div(i1, i2))]}(NEQ(i0, i1), i2, i3)]}(mod(i0, i1), i2, i0, i1)]}(Composite{[Composite{[Composite{[sub(Switch(i0, i1, i2), i3)]}(LT(i0, i1), Switch(i2, i1, i3), Switch(i4, i0, i5), i1)]}(i0, i1, LT(i2, i1), i2, LT(i0, i3), i3)]}(i0, i1, i2, i3), i4, i1)]}(i0, i1, add(i0, i2), i2, i3)]}(maximum(i0, i1), i2, i3, i4)]}}.0, TensorConstant{100})
400000B [(1000, 100)] i forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>)
400000B [(1000, 100)] v GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
4000B [(1000,)] c ARange(TensorConstant{0}, Tn, TensorConstant{1})
4000B [(1000,)] v Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
... (remaining 16 Apply account for 117B/2208117B ((0.01%)) of the Apply with dense outputs sizes)
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 1 calls of the op (for a total of 1000 steps) 8.461490e-01s
Total time spent in calling the VM 7.661736e-01s (90.548%)
Total overhead (computing slices..) 7.997537e-02s (9.452%)
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
37.2% 37.2% 0.131s 4.37e-05s C 3000 3 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
25.3% 62.6% 0.089s 2.97e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
24.8% 87.3% 0.087s 8.73e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
7.4% 94.8% 0.026s 2.62e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
3.1% 97.9% 0.011s 3.61e-06s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
1.1% 99.0% 0.004s 3.88e-06s C 1000 1 <class 'theano.tensor.basic.ScalarFromTensor'>
1.0% 100.0% 0.004s 3.65e-06s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
37.2% 37.2% 0.131s 4.37e-05s C 3000 3 GpuAlloc{memset_0=True}
25.3% 62.6% 0.089s 2.97e-05s C 3000 3 GpuGemv{inplace}
24.8% 87.3% 0.087s 8.73e-05s C 1000 1 GpuJoin
7.4% 94.8% 0.026s 2.62e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
3.1% 97.9% 0.011s 3.61e-06s C 3000 3 Shape_i{0}
1.1% 99.0% 0.004s 3.88e-06s C 1000 1 ScalarFromTensor
1.0% 100.0% 0.004s 3.65e-06s C 1000 1 GpuSubtensor{int32}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
24.8% 24.8% 0.087s 8.73e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=float32, shape=(1,), strides=c
input 2: dtype=float32, shape=(48,), strides=(1,)
input 3: dtype=float32, shape=(100,), strides=c
output 0: dtype=float32, shape=(149,), strides=(1,)
12.5% 37.3% 0.044s 4.41e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
12.5% 49.8% 0.044s 4.39e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
12.2% 62.0% 0.043s 4.31e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.5% 70.5% 0.030s 2.99e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.4% 79.0% 0.030s 2.97e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.4% 87.3% 0.030s 2.96e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
7.4% 94.8% 0.026s 2.62e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(100,), strides=(1,)
input 2: dtype=float32, shape=(100,), strides=(1,)
output 0: dtype=float32, shape=(100,), strides=(1,)
1.1% 95.9% 0.004s 3.93e-06s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
1.1% 97.0% 0.004s 3.88e-06s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
1.0% 98.0% 0.004s 3.65e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
input 0: dtype=float32, shape=(1000, 48), strides=c
input 1: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(48,), strides=(1,)
1.0% 99.0% 0.003s 3.48e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
1.0% 100.0% 0.003s 3.40e-06s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
Memory Profile
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 2KB
Max if linker=c|py: 2KB
Memory saved if gc is enabled (linker=c|py): 0KB
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
... (remaining 13 Apply account for 3616B/3616B ((100.00%)) of the Apply with dense outputs sizes)
All Apply nodes have output sizes that take less than 1024B.
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
Function profiling
==================
Message: Sum of all printed profiles at exit
Time in 1 calls to Function.__call__: 8.473389e-01s
Time in Function.fn.__call__: 1.613428e+00s (190.411%)
Time in thunks: 1.198847e+00s (141.484%)
Total compile time: 9.870379e-01s
Theano Optimizer time: 7.257051e-01s
Theano validate time: 1.985645e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.541780e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
70.6% 70.6% 0.846s 8.46e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
11.0% 81.5% 0.131s 4.38e-05s C 3001 4 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
7.4% 89.0% 0.089s 2.97e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
7.3% 96.3% 0.087s 8.73e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
2.2% 98.5% 0.026s 2.62e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
0.9% 99.4% 0.011s 3.61e-06s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.3% 99.7% 0.004s 3.88e-06s C 1005 6 <class 'theano.tensor.basic.ScalarFromTensor'>
0.3% 100.0% 0.004s 3.65e-06s C 1001 2 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.0% 100.0% 0.000s 6.44e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.81e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 1.00e-05s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
70.6% 70.6% 0.846s 8.46e-01s Py 1 1 forall_inplace,gpu,scan_fn}
11.0% 81.5% 0.131s 4.38e-05s C 3001 4 GpuAlloc{memset_0=True}
7.4% 89.0% 0.089s 2.97e-05s C 3000 3 GpuGemv{inplace}
7.3% 96.3% 0.087s 8.73e-05s C 1000 1 GpuJoin
2.2% 98.5% 0.026s 2.62e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
0.9% 99.4% 0.011s 3.61e-06s C 3000 3 Shape_i{0}
0.3% 99.7% 0.004s 3.88e-06s C 1005 6 ScalarFromTensor
0.3% 100.0% 0.004s 3.65e-06s C 1000 1 GpuSubtensor{int32}
0.0% 100.0% 0.000s 3.81e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 5.36e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{le,no_inplace}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Cast{int64}}
... (remaining 2 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
70.6% 70.6% 0.846s 8.46e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int32, shape=(1000,), strides=c
input 2: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 3: dtype=float32, shape=(100, 149), strides=c
input 4: dtype=float32, shape=(1000, 48), strides=c
input 5: dtype=float32, shape=(100, 149), strides=c
input 6: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
7.3% 77.9% 0.087s 8.73e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=float32, shape=(1,), strides=c
input 2: dtype=float32, shape=(48,), strides=(1,)
input 3: dtype=float32, shape=(100,), strides=c
output 0: dtype=float32, shape=(149,), strides=(1,)
3.7% 81.5% 0.044s 4.41e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
3.7% 85.2% 0.044s 4.39e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
3.6% 88.8% 0.043s 4.31e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 91.3% 0.030s 2.99e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 93.8% 0.030s 2.97e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 96.3% 0.030s 2.96e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.2% 98.4% 0.026s 2.62e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(100,), strides=(1,)
input 2: dtype=float32, shape=(100,), strides=(1,)
output 0: dtype=float32, shape=(100,), strides=(1,)
0.3% 98.8% 0.004s 3.93e-06s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.3% 99.1% 0.004s 3.88e-06s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
0.3% 99.4% 0.004s 3.65e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
input 0: dtype=float32, shape=(1000, 48), strides=c
input 1: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(48,), strides=(1,)
0.3% 99.7% 0.003s 3.48e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.3% 100.0% 0.003s 3.40e-06s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 2.18e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composi
input 0: dtype=float32, shape=(1, 1), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 3.81e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
input 0: dtype=float32, shape=(2500, 100), strides=c
input 1: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 2: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(2500, 100), strides=(100, 1)
0.0% 100.0% 0.000s 1.60e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int32, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
input 0: dtype=int32, shape=(1000,), strides=c
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 15 Apply instances account for 0.01%(0.00s) of the runtime)
Memory Profile (the max between all functions in that profile)
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 395KB
Max if linker=c|py: 395KB
Memory saved if gc is enabled (linker=c|py): 0KB
This list is based on all functions in the profile
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
1000000B [(2500, 100)] i GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubtensor{int64:int64:int8}.0, ScalarFromTensor.0)
400000B [(1000, 100)] c GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Cast{int64}(i0), int_div(i1, i2))]}(NEQ(i0, i1), i2, i3)]}(mod(i0, i1), i2, i0, i1)]}(Composite{[Composite{[Composite{[sub(Switch(i0, i1, i2), i3)]}(LT(i0, i1), Switch(i2, i1, i3), Switch(i4, i0, i5), i1)]}(i0, i1, LT(i2, i1), i2, LT(i0, i3), i3)]}(i0, i1, i2, i3), i4, i1)]}(i0, i1, add(i0, i2), i2, i3)]}(maximum(i0, i1), i2, i3, i4)]}}.0, TensorConstant{100})
400000B [(1000, 100)] i forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>)
400000B [(1000, 100)] v GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
4000B [(1000,)] c ARange(TensorConstant{0}, Tn, TensorConstant{1})
4000B [(1000,)] v Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
... (remaining 29 Apply account for 3733B/2211733B ((0.17%)) of the Apply with dense outputs sizes)
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
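Takeaway from the memory profile: peak usage is tiny (~395KB with gc disabled), essentially the 1000x100 float32 scan output buffer (400000B); the large 2500x100 buffer is written in place ('i') and the scan output is sliced as a view ('v'), so neither adds to the peak. The cost of the scan version is therefore dominated by per-step op overhead, not memory pressure.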