theano profile
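
What follows are Theano profiles of a Keras (Theano-backend) recurrent model: a CPU run followed by two GPU runs, each covering the five functions compiled in keras/models.py plus the Scan op sub-profiles. Output in this format comes from Theano's built-in profiler; a minimal way to produce it (a sketch — the actual training script is not part of this gist, and train.py stands in for it):

# Enable the profiler before any function is compiled, either via
#   THEANO_FLAGS=profile=True python train.py
# or programmatically:
import theano
theano.config.profile = True  # per-function profiles are printed at process exit
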
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:328
Time in 8 calls to Function.__call__: 1.477107e+02s
Time in Function.fn.__call__: 1.474529e+02s (99.825%)
Time in thunks: 1.470501e+02s (99.553%)
Total compile time: 1.377320e+02s
Number of Apply nodes: 525
Theano Optimizer time: 2.186309e+01s
Theano validate time: 2.525887e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 1.157349e+02s
Import time 3.757732e-01s
Time in all call to theano.grad() 4.098411e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
64.7% 64.7% 95.128s 9.91e-01s C 96 12 theano.tensor.blas.Dot22
11.5% 76.2% 16.857s 1.05e+00s Py 16 2 theano.scan_module.scan_op.Scan
9.5% 85.6% 13.954s 1.74e+00s C 8 1 theano.tensor.nnet.nnet.Softmax
7.9% 93.6% 11.631s 5.37e-03s C 2168 271 theano.tensor.elemwise.Elemwise
2.1% 95.7% 3.089s 3.86e-01s C 8 1 theano.tensor.blas.Dot22Scalar
1.6% 97.3% 2.389s 3.32e-02s C 72 9 theano.tensor.elemwise.Sum
1.4% 98.7% 2.029s 2.54e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor
0.4% 99.1% 0.618s 7.72e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad
0.3% 99.4% 0.454s 1.89e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor
0.3% 99.7% 0.453s 1.95e-03s C 232 29 theano.tensor.basic.Reshape
0.2% 99.9% 0.278s 2.32e-03s C 120 15 theano.tensor.basic.Alloc
0.1% 100.0% 0.164s 2.28e-03s C 72 9 theano.tensor.subtensor.IncSubtensor
0.0% 100.0% 0.002s 2.15e-04s Py 8 1 theano.tensor.basic.Nonzero
0.0% 100.0% 0.001s 2.26e-06s C 424 53 theano.compile.ops.Shape_i
0.0% 100.0% 0.001s 3.05e-06s C 264 33 theano.tensor.elemwise.DimShuffle
0.0% 100.0% 0.001s 4.06e-06s C 184 23 theano.tensor.opt.MakeVector
0.0% 100.0% 0.001s 3.71e-06s C 200 25 theano.tensor.subtensor.Subtensor
0.0% 100.0% 0.000s 1.16e-06s C 248 31 theano.tensor.basic.ScalarFromTensor
0.0% 100.0% 0.000s 2.26e-05s C 8 1 theano.tensor.basic.Join
0.0% 100.0% 0.000s 2.40e-06s C 24 3 theano.tensor.basic.Flatten
... (remaining 1 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
64.7% 64.7% 95.128s 9.91e-01s C 96 12 Dot22
9.5% 74.2% 13.954s 1.74e+00s C 8 1 Softmax
7.1% 81.3% 10.468s 1.31e+00s Py 8 1 forall_inplace,cpu,grad_of_scan_fn}
5.4% 86.7% 7.876s 9.85e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}
4.3% 91.0% 6.388s 7.99e-01s Py 8 1 forall_inplace,cpu,scan_fn}
2.1% 93.1% 3.089s 3.86e-01s C 8 1 Dot22Scalar
1.4% 94.5% 2.029s 2.54e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
1.3% 95.8% 1.907s 4.77e-02s C 40 5 Sum{axis=[0, 1], acc_dtype=float64}
1.2% 97.0% 1.788s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}
0.4% 97.4% 0.618s 7.72e-02s C 8 1 SoftmaxGrad
0.4% 97.8% 0.557s 1.51e-03s C 368 46 Elemwise{add,no_inplace}
0.3% 98.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64}
0.3% 98.4% 0.454s 1.89e-02s Py 24 3 AdvancedSubtensor
0.3% 98.7% 0.453s 2.83e-03s C 160 20 Reshape{2}
0.3% 99.0% 0.418s 5.22e-03s C 80 10 Elemwise{mul,no_inplace}
0.2% 99.2% 0.318s 3.98e-02s C 8 1 Elemwise{clip,no_inplace}
0.2% 99.4% 0.278s 2.32e-03s C 120 15 Alloc
0.2% 99.6% 0.262s 8.19e-03s C 32 4 Elemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))}}[(0, 0)]
0.1% 99.7% 0.206s 1.84e-03s C 112 14 Elemwise{Composite{(i0 * sqr(i1))}}
0.1% 99.8% 0.101s 3.15e-03s C 32 4 IncSubtensor{Inc;:int64:}
... (remaining 98 Ops account for 0.19%(0.27s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
11.8% 11.8% 17.349s 2.17e+00s 8 92 Dot22(Reshape{2}.0, Reshape{2}.0)
9.5% 21.3% 13.954s 1.74e+00s 8 397 Softmax(Reshape{2}.0)
9.1% 30.4% 13.341s 1.67e+00s 8 489 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
7.7% 38.1% 11.309s 1.41e+00s 8 93 Dot22(Reshape{2}.0, Reshape{2}.0)
7.1% 45.2% 10.468s 1.31e+00s 8 446 forall_inplace,cpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Elemwise{mul,no_inplace}.0, Elemwise{mul,no_inplace}.0, InplaceDimShuffle{0,2,1}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{::int64}.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Elem
6.6% 51.8% 9.739s 1.22e+00s 8 94 Dot22(Reshape{2}.0, Reshape{2}.0)
6.5% 58.2% 9.491s 1.19e+00s 8 91 Dot22(Reshape{2}.0, Reshape{2}.0)
5.6% 63.9% 8.252s 1.03e+00s 8 491 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
5.6% 69.5% 8.230s 1.03e+00s 8 433 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
5.4% 74.8% 7.876s 9.85e-01s 8 411 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
4.3% 79.2% 6.388s 7.99e-01s 8 337 forall_inplace,cpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Alloc.0, Alloc.0, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>)
4.0% 83.2% 5.945s 7.43e-01s 8 485 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
3.0% 86.2% 4.401s 5.50e-01s 8 493 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
2.1% 88.3% 3.094s 3.87e-01s 8 486 Dot22(Flatten{2}.0, Reshape{2}.0)
2.1% 90.4% 3.089s 3.86e-01s 8 487 Dot22Scalar(Flatten{2}.0, Reshape{2}.0, TensorConstant{0.0010000000475})
1.4% 91.8% 2.092s 2.62e-01s 8 434 Dot22(Reshape{2}.0, InplaceDimShuffle{1,0}.0)
1.4% 93.2% 2.029s 2.54e-01s 8 425 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.3% 94.5% 1.885s 2.36e-01s 8 370 Dot22(Reshape{2}.0, Reshape{2}.0)
1.2% 95.7% 1.788s 2.24e-01s 8 424 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0)
1.1% 96.8% 1.561s 1.95e-01s 8 430 Sum{axis=[0, 1], acc_dtype=float64}(InplaceDimShuffle{1,0,2}.0)
... (remaining 505 Apply instances account for 3.24%(4.77s) of the runtime)
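
In this CPU run almost two thirds of the thunk time (64.7%) sits in Dot22, Theano's BLAS matrix-matrix product, so the single most effective lever is the BLAS library Theano is linked against. A quick check (a sketch, not part of the original run):

# Which BLAS does this Theano/NumPy stack actually use?
import numpy
import theano
print(theano.config.blas.ldflags)  # empty string: Theano falls back to NumPy's dot
numpy.show_config()                # reports OpenBLAS/MKL/ATLAS linkage, if any
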
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 8 calls of the op (for a total of 240 steps) 6.386277e+00s
Total time spent in calling the VM 6.356155e+00s (99.528%)
Total overhead (computing slices..) 3.012228e-02s (0.472%)
Time in all call to theano.grad() 4.098411e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
97.9% 97.9% 6.218s 6.48e-03s C 960 4 theano.tensor.blas.Gemm
2.1% 100.0% 0.135s 1.88e-04s C 720 3 theano.tensor.elemwise.Elemwise
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
97.9% 97.9% 6.218s 6.48e-03s C 960 4 Gemm{no_inplace}
1.3% 99.2% 0.085s 3.52e-04s C 240 1 Elemwise{Composite{((i0 * i1 * i2) + (i3 * i4))}}
0.8% 100.0% 0.051s 1.06e-04s C 480 2 Elemwise{mul,no_inplace}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
33.6% 33.6% 2.132s 8.88e-03s 240 2 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0})
23.1% 56.6% 1.467s 6.11e-03s 240 1 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0})
20.8% 77.5% 1.322s 5.51e-03s 240 4 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0})
20.4% 97.9% 1.297s 5.40e-03s 240 3 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0})
1.3% 99.2% 0.085s 3.52e-04s 240 5 Elemwise{Composite{((i0 * i1 * i2) + (i3 * i4))}}(Gemm{no_inplace}.0, <TensorType(int8, col)>, <TensorType(float32, matrix)>, Gemm{no_inplace}.0, Gemm{no_inplace}.0)
0.5% 99.8% 0.035s 1.45e-04s 240 0 Elemwise{mul,no_inplace}(<TensorType(int8, col)>, <TensorType(float32, matrix)>)
0.2% 100.0% 0.016s 6.60e-05s 240 6 Elemwise{mul,no_inplace}(Gemm{no_inplace}.0, Elemwise{Composite{((i0 * i1 * i2) + (i3 * i4))}}.0)
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
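
The step function profiled here runs four Gemm{no_inplace} products per step plus a masked elementwise update, which is the shape one would expect from a gated recurrent (LSTM-style) step under theano.scan. A hedged sketch of the kind of step that compiles to a profile like this (names and sizes are illustrative, not taken from this gist):

import numpy as np
import theano
import theano.tensor as T

X = T.tensor3('X')   # (timesteps, batch, dim) -- illustrative layout
h0 = T.matrix('h0')
W = theano.shared(np.random.randn(64, 64).astype('float32'), name='W')
U = theano.shared(np.random.randn(64, 64).astype('float32'), name='U')

def step(x_t, h_prev):
    # one input product and one recurrent product per step; a full LSTM
    # does four such pairs (one per gate), matching the four Gemms above
    return T.tanh(T.dot(x_t, W) + T.dot(h_prev, U))

h, _ = theano.scan(step, sequences=X, outputs_info=h0)
f = theano.function([X, h0], h[-1], profile=True)
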
Scan Op profiling ( grad_of_scan_fn )
==================
Message: None
Time in 8 calls of the op (for a total of 240 steps) 1.046483e+01s
Total time spent in calling the VM 1.024789e+01s (97.927%)
Total overhead (computing slices..) 2.169311e-01s (2.073%)
Time in all call to theano.grad() 4.098411e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
60.8% 60.8% 6.226s 2.36e-03s C 2640 11 theano.tensor.blas.Gemm
32.6% 93.5% 3.340s 2.32e-03s C 1440 6 theano.tensor.blas.Dot22
6.5% 100.0% 0.667s 1.85e-04s C 3600 15 theano.tensor.elemwise.Elemwise
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
37.8% 37.8% 3.867s 2.30e-03s C 1680 7 Gemm{inplace}
32.6% 70.4% 3.340s 2.32e-03s C 1440 6 Dot22
23.0% 93.5% 2.359s 2.46e-03s C 960 4 Gemm{no_inplace}
3.2% 96.6% 0.323s 2.24e-04s C 1440 6 Elemwise{add,no_inplace}
1.1% 97.7% 0.111s 4.64e-04s C 240 1 Elemwise{Composite{(i0 + (i1 * i2 * i3 * i4) + (i1 * i5 * i2))}}
0.9% 98.7% 0.096s 1.00e-04s C 960 4 Elemwise{mul}
0.9% 99.5% 0.088s 3.65e-04s C 240 1 Elemwise{Composite{(i0 + ((i1 + i2) * i3) + (i4 * i3))}}
0.3% 99.8% 0.029s 6.00e-05s C 480 2 Elemwise{Mul}[(0, 1)]
0.2% 100.0% 0.020s 8.42e-05s C 240 1 Elemwise{Mul}[(0, 2)]
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
9.2% 9.2% 0.938s 3.91e-03s 240 26 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, <TensorType(float32, matrix)>, Elemwise{Mul}[(0, 1)].0, TensorConstant{1.0})
7.3% 16.4% 0.744s 3.10e-03s 240 18 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, Elemwise{mul}.0, <TensorType(float32, matrix)>, TensorConstant{1.0})
7.1% 23.6% 0.731s 3.04e-03s 240 17 Dot22(Elemwise{mul}.0, <TensorType(float32, matrix)>)
7.0% 30.6% 0.720s 3.00e-03s 240 0 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0})
6.1% 36.7% 0.626s 2.61e-03s 240 19 Dot22(<TensorType(float32, matrix)>, Elemwise{Mul}[(0, 2)].0)
5.7% 42.4% 0.584s 2.43e-03s 240 24 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, <TensorType(float32, matrix)>, Elemwise{Mul}[(0, 1)].0, TensorConstant{1.0})
5.7% 48.1% 0.581s 2.42e-03s 240 10 Dot22(Elemwise{mul}.0, <TensorType(float32, matrix)>)
5.5% 53.6% 0.562s 2.34e-03s 240 2 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0})
5.5% 59.1% 0.561s 2.34e-03s 240 1 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0})
5.2% 64.3% 0.529s 2.20e-03s 240 14 Dot22(<TensorType(float32, matrix)>, Elemwise{mul}.0)
5.0% 69.3% 0.515s 2.15e-03s 240 4 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0})
4.4% 73.7% 0.446s 1.86e-03s 240 25 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, Elemwise{Mul}[(0, 2)].0, <TensorType(float32, matrix)>, TensorConstant{1.0})
4.3% 78.0% 0.440s 1.83e-03s 240 16 Dot22(<TensorType(float32, matrix)>, Elemwise{mul}.0)
4.2% 82.2% 0.434s 1.81e-03s 240 9 Dot22(Elemwise{mul}.0, <TensorType(float32, matrix)>)
4.0% 86.2% 0.411s 1.71e-03s 240 27 Gemm{inplace}(Gemm{inplace}.0, TensorConstant{1.0}, Elemwise{Mul}[(0, 1)].0, <TensorType(float32, matrix)>, TensorConstant{1.0})
3.8% 90.0% 0.391s 1.63e-03s 240 22 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, <TensorType(float32, matrix)>, Elemwise{mul}.0, TensorConstant{1.0})
3.4% 93.5% 0.352s 1.47e-03s 240 21 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, Elemwise{Mul}[(0, 1)].0, <TensorType(float32, matrix)>, TensorConstant{1.0})
1.1% 94.6% 0.111s 4.64e-04s 240 7 Elemwise{Composite{(i0 + (i1 * i2 * i3 * i4) + (i1 * i5 * i2))}}(<TensorType(float32, matrix)>, <TensorType(int8, col)>, Gemm{no_inplace}.0, <TensorType(float32, matrix)>, Gemm{no_inplace}.0, <TensorType(float32, matrix)>)
0.9% 95.4% 0.088s 3.65e-04s 240 31 Elemwise{Composite{(i0 + ((i1 + i2) * i3) + (i4 * i3))}}(<TensorType(float32, matrix)>, Gemm{inplace}.0, Gemm{inplace}.0, <TensorType(int8, col)>, Gemm{inplace}.0)
0.7% 96.2% 0.075s 3.12e-04s 240 30 Elemwise{add,no_inplace}(<TensorType(float32, matrix)>, Gemm{inplace}.0)
... (remaining 12 Apply instances account for 3.85%(0.39s) of the runtime)
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:330
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 2.309456e+01s
Number of Apply nodes: 536
Theano Optimizer time: 1.634476e+01s
Theano validate time: 2.549255e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 6.595885e+00s
Import time 7.425475e-02s
Time in all call to theano.grad() 4.098411e-01s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:332
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 8.350207e+00s
Number of Apply nodes: 134
Theano Optimizer time: 1.283817e+00s
Theano validate time: 5.622768e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 7.014123e+00s
Import time 3.218102e-02s
Time in all call to theano.grad() 4.098411e-01s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:334
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 5.702252e+00s
Number of Apply nodes: 152
Theano Optimizer time: 1.817622e+00s
Theano validate time: 7.331610e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 3.821592e+00s
Import time 1.338577e-02s
Time in all call to theano.grad() 4.098411e-01s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:336
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 1.727933e+00s
Number of Apply nodes: 163
Theano Optimizer time: 1.428595e+00s
Theano validate time: 7.478404e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.332819e-01s
Import time 0.000000e+00s
Time in all call to theano.grad() 4.098411e-01s
Function profiling
==================
Message: Sum of all(5) printed profiles at exit excluding Scan op profile.
Time in 8 calls to Function.__call__: 1.477107e+02s
Time in Function.fn.__call__: 1.474529e+02s (99.825%)
Time in thunks: 1.470501e+02s (99.553%)
Total compile time: 1.766070e+02s
Number of Apply nodes: 525
Theano Optimizer time: 4.273788e+01s
Theano validate time: 7.118421e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 1.333998e+02s
Import time 4.955947e-01s
Time in all call to theano.grad() 4.098411e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
64.7% 64.7% 95.128s 9.91e-01s C 96 12 theano.tensor.blas.Dot22
11.5% 76.2% 16.857s 1.05e+00s Py 16 2 theano.scan_module.scan_op.Scan
9.5% 85.6% 13.954s 1.74e+00s C 8 1 theano.tensor.nnet.nnet.Softmax
7.9% 93.6% 11.631s 5.37e-03s C 2168 271 theano.tensor.elemwise.Elemwise
2.1% 95.7% 3.089s 3.86e-01s C 8 1 theano.tensor.blas.Dot22Scalar
1.6% 97.3% 2.389s 3.32e-02s C 72 9 theano.tensor.elemwise.Sum
1.4% 98.7% 2.029s 2.54e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor
0.4% 99.1% 0.618s 7.72e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad
0.3% 99.4% 0.454s 1.89e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor
0.3% 99.7% 0.453s 1.95e-03s C 232 29 theano.tensor.basic.Reshape
0.2% 99.9% 0.278s 2.32e-03s C 120 15 theano.tensor.basic.Alloc
0.1% 100.0% 0.164s 2.28e-03s C 72 9 theano.tensor.subtensor.IncSubtensor
0.0% 100.0% 0.002s 2.15e-04s Py 8 1 theano.tensor.basic.Nonzero
0.0% 100.0% 0.001s 2.26e-06s C 424 53 theano.compile.ops.Shape_i
0.0% 100.0% 0.001s 3.05e-06s C 264 33 theano.tensor.elemwise.DimShuffle
0.0% 100.0% 0.001s 4.06e-06s C 184 23 theano.tensor.opt.MakeVector
0.0% 100.0% 0.001s 3.71e-06s C 200 25 theano.tensor.subtensor.Subtensor
0.0% 100.0% 0.000s 1.16e-06s C 248 31 theano.tensor.basic.ScalarFromTensor
0.0% 100.0% 0.000s 2.26e-05s C 8 1 theano.tensor.basic.Join
0.0% 100.0% 0.000s 2.40e-06s C 24 3 theano.tensor.basic.Flatten
... (remaining 1 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
64.7% 64.7% 95.128s 9.91e-01s C 96 12 Dot22
9.5% 74.2% 13.954s 1.74e+00s C 8 1 Softmax
7.1% 81.3% 10.468s 1.31e+00s Py 8 1 forall_inplace,cpu,grad_of_scan_fn}
5.4% 86.7% 7.876s 9.85e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}
4.3% 91.0% 6.388s 7.99e-01s Py 8 1 forall_inplace,cpu,scan_fn}
2.1% 93.1% 3.089s 3.86e-01s C 8 1 Dot22Scalar
1.4% 94.5% 2.029s 2.54e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
1.3% 95.8% 1.907s 4.77e-02s C 40 5 Sum{axis=[0, 1], acc_dtype=float64}
1.2% 97.0% 1.788s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}
0.4% 97.4% 0.618s 7.72e-02s C 8 1 SoftmaxGrad
0.4% 97.8% 0.557s 1.51e-03s C 368 46 Elemwise{add,no_inplace}
0.3% 98.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64}
0.3% 98.4% 0.454s 1.89e-02s Py 24 3 AdvancedSubtensor
0.3% 98.7% 0.453s 2.83e-03s C 160 20 Reshape{2}
0.3% 99.0% 0.418s 5.22e-03s C 80 10 Elemwise{mul,no_inplace}
0.2% 99.2% 0.318s 3.98e-02s C 8 1 Elemwise{clip,no_inplace}
0.2% 99.4% 0.278s 2.32e-03s C 120 15 Alloc
0.2% 99.6% 0.262s 8.19e-03s C 32 4 Elemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))}}[(0, 0)]
0.1% 99.7% 0.206s 1.84e-03s C 112 14 Elemwise{Composite{(i0 * sqr(i1))}}
0.1% 99.8% 0.101s 3.15e-03s C 32 4 IncSubtensor{Inc;:int64:}
... (remaining 98 Ops account for 0.19%(0.27s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
11.8% 11.8% 17.349s 2.17e+00s 8 92 Dot22(Reshape{2}.0, Reshape{2}.0)
9.5% 21.3% 13.954s 1.74e+00s 8 397 Softmax(Reshape{2}.0)
9.1% 30.4% 13.341s 1.67e+00s 8 489 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
7.7% 38.1% 11.309s 1.41e+00s 8 93 Dot22(Reshape{2}.0, Reshape{2}.0)
7.1% 45.2% 10.468s 1.31e+00s 8 446 forall_inplace,cpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Elemwise{mul,no_inplace}.0, Elemwise{mul,no_inplace}.0, InplaceDimShuffle{0,2,1}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{::int64}.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Elem
6.6% 51.8% 9.739s 1.22e+00s 8 94 Dot22(Reshape{2}.0, Reshape{2}.0)
6.5% 58.2% 9.491s 1.19e+00s 8 91 Dot22(Reshape{2}.0, Reshape{2}.0)
5.6% 63.9% 8.252s 1.03e+00s 8 491 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
5.6% 69.5% 8.230s 1.03e+00s 8 433 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
5.4% 74.8% 7.876s 9.85e-01s 8 411 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
4.3% 79.2% 6.388s 7.99e-01s 8 337 forall_inplace,cpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Alloc.0, Alloc.0, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>)
4.0% 83.2% 5.945s 7.43e-01s 8 485 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
3.0% 86.2% 4.401s 5.50e-01s 8 493 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0)
2.1% 88.3% 3.094s 3.87e-01s 8 486 Dot22(Flatten{2}.0, Reshape{2}.0)
2.1% 90.4% 3.089s 3.86e-01s 8 487 Dot22Scalar(Flatten{2}.0, Reshape{2}.0, TensorConstant{0.0010000000475})
1.4% 91.8% 2.092s 2.62e-01s 8 434 Dot22(Reshape{2}.0, InplaceDimShuffle{1,0}.0)
1.4% 93.2% 2.029s 2.54e-01s 8 425 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.3% 94.5% 1.885s 2.36e-01s 8 370 Dot22(Reshape{2}.0, Reshape{2}.0)
1.2% 95.7% 1.788s 2.24e-01s 8 424 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0)
1.1% 96.8% 1.561s 1.95e-01s 8 430 Sum{axis=[0, 1], acc_dtype=float64}(InplaceDimShuffle{1,0,2}.0)
... (remaining 505 Apply instances account for 3.24%(4.77s) of the runtime)
training time 148.004755974
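
The next profile is the same training function moved to the GPU via the old theano.sandbox.cuda backend (GpuDot22, GpuGemm, HostFromGpu/GpuFromHost below); time in the 8 calls to Function.__call__ drops from ~147.7 s to ~17.8 s. Selecting that backend is a flags change rather than a code change (sketch):

#   THEANO_FLAGS=device=gpu,floatX=float32,profile=True python train.py
# or, programmatically, before compiling anything:
import theano.sandbox.cuda
theano.sandbox.cuda.use('gpu0')
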
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:328
Time in 8 calls to Function.__call__: 1.782110e+01s
Time in Function.fn.__call__: 1.747024e+01s (98.031%)
Time in thunks: 1.711180e+01s (96.020%)
Total compile time: 2.021389e+01s
Number of Apply nodes: 530
Theano Optimizer time: 1.830422e+01s
Theano validate time: 2.977979e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 1.739421e+00s
Import time 1.584115e-01s
Time in all call to theano.grad() 4.073255e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
60.1% 60.1% 10.281s 6.15e-03s C 1672 209 theano.tensor.elemwise.Elemwise
12.1% 72.2% 2.077s 2.60e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor
5.1% 77.3% 0.874s 5.47e-02s Py 16 2 theano.scan_module.scan_op.Scan
4.2% 81.6% 0.725s 7.56e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22
3.7% 85.3% 0.634s 2.64e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu
3.6% 88.8% 0.610s 7.63e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad
2.8% 91.6% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum
2.6% 94.2% 0.444s 1.85e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor
2.4% 96.7% 0.417s 1.74e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost
1.0% 97.7% 0.168s 1.05e-02s C 16 2 theano.tensor.basic.Alloc
0.7% 98.4% 0.124s 5.76e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape
0.7% 99.1% 0.124s 2.45e-04s C 504 63 theano.sandbox.cuda.basic_ops.GpuElemwise
0.3% 99.4% 0.055s 1.38e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce
0.2% 99.6% 0.028s 3.87e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor
0.2% 99.8% 0.027s 2.62e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc
0.1% 99.8% 0.015s 1.92e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar
0.1% 100.0% 0.009s 1.16e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten
0.0% 100.0% 0.002s 2.42e-04s Py 8 1 theano.tensor.basic.Nonzero
0.0% 100.0% 0.001s 2.13e-06s C 416 52 theano.compile.ops.Shape_i
... (remaining 10 Classes account for 0.02%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
46.1% 46.1% 7.889s 9.86e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}
12.1% 58.2% 2.077s 2.60e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
10.4% 68.7% 1.783s 2.23e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}
4.2% 72.9% 0.725s 7.56e-03s C 96 12 GpuDot22
4.1% 77.0% 0.704s 8.80e-02s Py 8 1 forall_inplace,gpu,grad_of_scan_fn}
3.7% 80.7% 0.634s 2.64e-02s C 24 3 HostFromGpu
3.6% 84.3% 0.610s 7.63e-02s C 8 1 SoftmaxGrad
2.8% 87.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64}
2.6% 89.7% 0.444s 1.85e-02s Py 24 3 AdvancedSubtensor
2.4% 92.1% 0.417s 1.74e-02s C 24 3 GpuFromHost
1.8% 94.0% 0.315s 3.94e-02s C 8 1 Elemwise{clip,no_inplace}
1.7% 95.7% 0.290s 1.21e-02s C 24 3 Elemwise{mul,no_inplace}
1.0% 96.7% 0.170s 2.13e-02s Py 8 1 forall_inplace,gpu,scan_fn}
1.0% 97.6% 0.168s 1.05e-02s C 16 2 Alloc
0.7% 98.4% 0.124s 8.16e-04s C 152 19 GpuReshape{2}
0.3% 98.7% 0.055s 1.38e-03s C 40 5 GpuCAReduce{add}{1,1,0}
0.2% 98.9% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)]
0.2% 99.0% 0.027s 2.78e-04s C 96 12 GpuAlloc{memset_0=True}
0.1% 99.2% 0.025s 2.41e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace}
0.1% 99.3% 0.022s 3.85e-04s C 56 7 GpuElemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))},no_inplace}
... (remaining 107 Ops account for 0.71%(0.12s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
46.1% 46.1% 7.889s 9.86e-01s 8 418 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
12.1% 58.2% 2.077s 2.60e-01s 8 421 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0)
10.4% 68.7% 1.783s 2.23e-01s 8 419 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0)
4.1% 72.8% 0.704s 8.80e-02s 8 451 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{Mul}[(0, 1)].0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True}
3.6% 76.3% 0.610s 7.63e-02s 8 425 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0)
1.9% 78.2% 0.319s 3.99e-02s 8 406 HostFromGpu(GpuReshape{3}.0)
1.8% 80.0% 0.315s 3.94e-02s 8 414 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0})
1.8% 81.9% 0.314s 3.93e-02s 8 401 HostFromGpu(GpuSoftmax.0)
1.7% 83.6% 0.290s 3.63e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0)
1.3% 84.9% 0.228s 2.84e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0)
1.3% 86.2% 0.216s 2.70e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.2% 87.4% 0.209s 2.61e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>)
1.2% 88.6% 0.208s 2.60e-02s 8 427 GpuFromHost(SoftmaxGrad.0)
1.0% 89.6% 0.170s 2.13e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(
1.0% 90.6% 0.167s 2.09e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0)
0.9% 91.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0)
0.9% 92.5% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)
0.9% 93.4% 0.161s 2.01e-02s 8 415 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0)
0.4% 93.8% 0.066s 8.26e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0)
0.4% 94.2% 0.065s 8.16e-03s 8 435 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0)
... (remaining 510 Apply instances account for 5.84%(1.00s) of the runtime)
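
With the matrix products on the GPU, this profile is instead dominated by the categorical-crossentropy term Elemwise{(i0 * log((i1 / i2)))} at 46.1%, which still runs on the host, plus the transfers feeding it (HostFromGpu copies the softmax output back to the CPU, GpuFromHost copies the SoftmaxGrad result back to the GPU). A hedged way to locate such round trips in a compiled function (fn is hypothetical and stands in for the compiled training function, which this gist does not include):

import theano
theano.printing.debugprint(fn)  # HostFromGpu / GpuFromHost nodes mark the transfers
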
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 8 calls of the op (for a total of 240 steps) 1.655595e-01s
Total time spent in calling the VM 1.615922e-01s (97.604%)
Total overhead (computing slices..) 3.967285e-03s (2.396%)
Time in all call to theano.grad() 4.073255e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
81.1% 81.1% 0.129s 1.35e-04s C 960 4 theano.sandbox.cuda.blas.GpuGemm
13.8% 95.0% 0.022s 3.06e-05s C 720 3 theano.sandbox.cuda.basic_ops.GpuElemwise
4.9% 99.8% 0.008s 3.23e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost
0.2% 100.0% 0.000s 1.12e-06s C 240 1 theano.tensor.elemwise.Elemwise
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
81.1% 81.1% 0.129s 1.35e-04s C 960 4 GpuGemm{no_inplace}
8.6% 89.7% 0.014s 2.84e-05s C 480 2 GpuElemwise{mul,no_inplace}
5.3% 95.0% 0.008s 3.50e-05s C 240 1 GpuElemwise{Composite{((i0 * (i1 * i2)) + (i3 * i4))},no_inplace}
4.9% 99.8% 0.008s 3.23e-05s C 240 1 GpuFromHost
0.2% 100.0% 0.000s 1.12e-06s C 240 1 Elemwise{Cast{float32}}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
20.7% 20.7% 0.033s 1.38e-04s 240 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
20.5% 41.2% 0.033s 1.36e-04s 240 6 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
20.0% 61.2% 0.032s 1.33e-04s 240 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
19.9% 81.1% 0.032s 1.32e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
5.3% 86.4% 0.008s 3.50e-05s 240 7 GpuElemwise{Composite{((i0 * (i1 * i2)) + (i3 * i4))},no_inplace}(GpuGemm{no_inplace}.0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuGemm{no_inplace}.0, GpuGemm{no_inplace}.0)
5.2% 91.6% 0.008s 3.45e-05s 240 2 GpuElemwise{mul,no_inplace}(GpuFromHost.0, <CudaNdarrayType(float32, matrix)>)
4.9% 96.5% 0.008s 3.23e-05s 240 1 GpuFromHost(Elemwise{Cast{float32}}.0)
3.4% 99.8% 0.005s 2.23e-05s 240 8 GpuElemwise{mul,no_inplace}(GpuGemm{no_inplace}.0, GpuElemwise{Composite{((i0 * (i1 * i2)) + (i3 * i4))},no_inplace}.0)
0.2% 100.0% 0.000s 1.12e-06s 240 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>)
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
Scan Op profiling ( grad_of_scan_fn )
==================
Message: None
Time in 8 calls of the op (for a total of 240 steps) 6.921008e-01s
Total time spent in calling the VM 6.207879e-01s (89.696%)
Total overhead (computing slices..) 7.131290e-02s (10.304%)
Time in all call to theano.grad() 4.073255e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
53.6% 53.6% 0.329s 1.25e-04s C 2640 11 theano.sandbox.cuda.blas.GpuGemm
24.6% 78.2% 0.151s 1.05e-04s C 1440 6 theano.sandbox.cuda.blas.GpuDot22
20.2% 98.4% 0.124s 3.24e-05s C 3840 16 theano.sandbox.cuda.basic_ops.GpuElemwise
1.5% 99.9% 0.009s 3.94e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost
0.1% 100.0% 0.000s 1.36e-06s C 240 1 theano.tensor.elemwise.Elemwise
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
32.2% 32.2% 0.198s 1.18e-04s C 1680 7 GpuGemm{inplace}
24.6% 56.8% 0.151s 1.05e-04s C 1440 6 GpuDot22
21.4% 78.2% 0.131s 1.37e-04s C 960 4 GpuGemm{no_inplace}
8.6% 86.8% 0.053s 3.67e-05s C 1440 6 GpuElemwise{add,no_inplace}
4.2% 91.0% 0.026s 2.69e-05s C 960 4 GpuElemwise{mul,no_inplace}
2.8% 93.7% 0.017s 2.36e-05s C 720 3 GpuElemwise{Mul}[(0, 1)]
1.9% 95.6% 0.012s 4.86e-05s C 240 1 GpuElemwise{Composite{((((i0 + i1) * i2) + (i3 * i2)) + i4)},no_inplace}
1.9% 97.5% 0.012s 4.85e-05s C 240 1 GpuElemwise{Composite{((((i0 * i1) * i2) + ((i3 * i1) * i2)) + i4)},no_inplace}
1.5% 99.1% 0.009s 3.94e-05s C 240 1 GpuFromHost
0.9% 99.9% 0.005s 2.25e-05s C 240 1 GpuElemwise{Mul}[(0, 0)]
0.1% 100.0% 0.000s 1.36e-06s C 240 1 Elemwise{Cast{float32}}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
5.6% 5.6% 0.034s 1.43e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
5.3% 10.9% 0.033s 1.36e-04s 240 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
5.3% 16.2% 0.032s 1.35e-04s 240 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
5.2% 21.4% 0.032s 1.33e-04s 240 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.7% 26.1% 0.029s 1.21e-04s 240 28 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 0)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.7% 30.8% 0.029s 1.21e-04s 240 18 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.7% 35.5% 0.029s 1.20e-04s 240 24 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.7% 40.2% 0.029s 1.19e-04s 240 30 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.6% 44.7% 0.028s 1.17e-04s 240 10 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>)
4.5% 49.2% 0.028s 1.15e-04s 240 21 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>)
4.5% 53.7% 0.028s 1.15e-04s 240 25 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0})
4.5% 58.2% 0.027s 1.15e-04s 240 29 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0})
4.5% 62.7% 0.027s 1.14e-04s 240 27 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0})
4.4% 67.1% 0.027s 1.14e-04s 240 9 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>)
3.7% 70.8% 0.023s 9.53e-05s 240 17 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0)
3.7% 74.5% 0.023s 9.42e-05s 240 22 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 0)].0)
3.7% 78.2% 0.023s 9.40e-05s 240 20 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0)
2.0% 80.2% 0.012s 5.16e-05s 240 31 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0)
2.0% 82.2% 0.012s 5.06e-05s 240 32 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0)
2.0% 84.1% 0.012s 5.06e-05s 240 33 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0)
... (remaining 15 Apply instances account for 15.86%(0.10s) of the runtime)
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:330
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 2.060873e+01s
Number of Apply nodes: 541
Theano Optimizer time: 1.904841e+01s
Theano validate time: 8.013768e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 1.383735e+00s
Import time 1.165748e-02s
Time in all call to theano.grad() 4.073255e-01s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:332
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 1.925702e+00s
Number of Apply nodes: 137
Theano Optimizer time: 1.554818e+00s
Theano validate time: 5.621839e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 3.086841e-01s
Import time 7.719040e-03s
Time in all call to theano.grad() 4.073255e-01s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:334
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 2.623655e+00s
Number of Apply nodes: 155
Theano Optimizer time: 2.232972e+00s
Theano validate time: 7.306647e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 3.238871e-01s
Import time 5.110025e-03s
Time in all call to theano.grad() 4.073255e-01s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:336
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 2.094579e+00s
Number of Apply nodes: 166
Theano Optimizer time: 1.699915e+00s
Theano validate time: 7.587528e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 3.269272e-01s
Import time 0.000000e+00s
Time in all call to theano.grad() 4.073255e-01s
Function profiling
==================
Message: Sum of all(5) printed profiles at exit excluding Scan op profile.
Time in 8 calls to Function.__call__: 1.782110e+01s
Time in Function.fn.__call__: 1.747024e+01s (98.031%)
Time in thunks: 1.711180e+01s (96.020%)
Total compile time: 4.746655e+01s
Number of Apply nodes: 530
Theano Optimizer time: 4.284033e+01s
Theano validate time: 1.304335e+00s
Theano Linker time (includes C, CUDA code generation/compiling): 4.082654e+00s
Import time 1.828980e-01s
Time in all call to theano.grad() 4.073255e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
60.1% 60.1% 10.281s 6.15e-03s C 1672 209 theano.tensor.elemwise.Elemwise
12.1% 72.2% 2.077s 2.60e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor
5.1% 77.3% 0.874s 5.47e-02s Py 16 2 theano.scan_module.scan_op.Scan
4.2% 81.6% 0.725s 7.56e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22
3.7% 85.3% 0.634s 2.64e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu
3.6% 88.8% 0.610s 7.63e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad
2.8% 91.6% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum
2.6% 94.2% 0.444s 1.85e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor
2.4% 96.7% 0.417s 1.74e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost
1.0% 97.7% 0.168s 1.05e-02s C 16 2 theano.tensor.basic.Alloc
0.7% 98.4% 0.124s 5.76e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape
0.7% 99.1% 0.124s 2.45e-04s C 504 63 theano.sandbox.cuda.basic_ops.GpuElemwise
0.3% 99.4% 0.055s 1.38e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce
0.2% 99.6% 0.028s 3.87e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor
0.2% 99.8% 0.027s 2.62e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc
0.1% 99.8% 0.015s 1.92e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar
0.1% 100.0% 0.009s 1.16e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten
0.0% 100.0% 0.002s 2.42e-04s Py 8 1 theano.tensor.basic.Nonzero
0.0% 100.0% 0.001s 2.13e-06s C 416 52 theano.compile.ops.Shape_i
... (remaining 10 Classes account for 0.02%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
46.1% 46.1% 7.889s 9.86e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}
12.1% 58.2% 2.077s 2.60e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
10.4% 68.7% 1.783s 2.23e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}
4.2% 72.9% 0.725s 7.56e-03s C 96 12 GpuDot22
4.1% 77.0% 0.704s 8.80e-02s Py 8 1 forall_inplace,gpu,grad_of_scan_fn}
3.7% 80.7% 0.634s 2.64e-02s C 24 3 HostFromGpu
3.6% 84.3% 0.610s 7.63e-02s C 8 1 SoftmaxGrad
2.8% 87.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64}
2.6% 89.7% 0.444s 1.85e-02s Py 24 3 AdvancedSubtensor
2.4% 92.1% 0.417s 1.74e-02s C 24 3 GpuFromHost
1.8% 94.0% 0.315s 3.94e-02s C 8 1 Elemwise{clip,no_inplace}
1.7% 95.7% 0.290s 1.21e-02s C 24 3 Elemwise{mul,no_inplace}
1.0% 96.7% 0.170s 2.13e-02s Py 8 1 forall_inplace,gpu,scan_fn}
1.0% 97.6% 0.168s 1.05e-02s C 16 2 Alloc
0.7% 98.4% 0.124s 8.16e-04s C 152 19 GpuReshape{2}
0.3% 98.7% 0.055s 1.38e-03s C 40 5 GpuCAReduce{add}{1,1,0}
0.2% 98.9% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)]
0.2% 99.0% 0.027s 2.78e-04s C 96 12 GpuAlloc{memset_0=True}
0.1% 99.2% 0.025s 2.41e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace}
0.1% 99.3% 0.022s 3.85e-04s C 56 7 GpuElemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))},no_inplace}
... (remaining 107 Ops account for 0.71%(0.12s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
46.1% 46.1% 7.889s 9.86e-01s 8 418 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
12.1% 58.2% 2.077s 2.60e-01s 8 421 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0)
10.4% 68.7% 1.783s 2.23e-01s 8 419 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0)
4.1% 72.8% 0.704s 8.80e-02s 8 451 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{Mul}[(0, 1)].0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True}
3.6% 76.3% 0.610s 7.63e-02s 8 425 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0)
1.9% 78.2% 0.319s 3.99e-02s 8 406 HostFromGpu(GpuReshape{3}.0)
1.8% 80.0% 0.315s 3.94e-02s 8 414 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0})
1.8% 81.9% 0.314s 3.93e-02s 8 401 HostFromGpu(GpuSoftmax.0)
1.7% 83.6% 0.290s 3.63e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0)
1.3% 84.9% 0.228s 2.84e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0)
1.3% 86.2% 0.216s 2.70e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.2% 87.4% 0.209s 2.61e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>)
1.2% 88.6% 0.208s 2.60e-02s 8 427 GpuFromHost(SoftmaxGrad.0)
1.0% 89.6% 0.170s 2.13e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(
1.0% 90.6% 0.167s 2.09e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0)
0.9% 91.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0)
0.9% 92.5% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)
0.9% 93.4% 0.161s 2.01e-02s 8 415 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0)
0.4% 93.8% 0.066s 8.26e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0)
0.4% 94.2% 0.065s 8.16e-03s 8 435 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0)
... (remaining 510 Apply instances account for 5.84%(1.00s) of the runtime)
training time 18.1833021641
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:328
Time in 8 calls to Function.__call__: 1.868278e+01s
Time in Function.fn.__call__: 1.834628e+01s (98.199%)
Time in thunks: 1.794744e+01s (96.064%)
Total compile time: 2.445579e+01s
Number of Apply nodes: 532
Theano Optimizer time: 2.181430e+01s
Theano validate time: 2.990005e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 2.419208e+00s
Import time 1.909029e-01s
Time in all call to theano.grad() 1.037111e+00s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
61.6% 61.6% 11.056s 6.61e-03s C 1672 209 theano.tensor.elemwise.Elemwise
11.4% 73.0% 2.045s 2.56e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor
5.6% 78.6% 0.999s 6.24e-02s Py 16 2 theano.scan_module.scan_op.Scan
4.2% 82.8% 0.753s 7.84e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22
3.6% 86.4% 0.647s 2.69e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu
3.4% 89.8% 0.613s 7.66e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad
2.7% 92.5% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum
2.4% 94.9% 0.436s 1.82e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor
1.8% 96.7% 0.321s 1.34e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost
0.9% 97.6% 0.169s 1.06e-02s C 16 2 theano.tensor.basic.Alloc
0.7% 98.3% 0.129s 5.98e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape
0.7% 99.1% 0.129s 2.48e-04s C 520 65 theano.sandbox.cuda.basic_ops.GpuElemwise
0.3% 99.4% 0.054s 1.35e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce
0.2% 99.6% 0.044s 6.08e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor
0.2% 99.8% 0.028s 2.67e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc
0.1% 99.9% 0.016s 1.99e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar
0.1% 100.0% 0.009s 1.15e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten
0.0% 100.0% 0.002s 2.06e-04s Py 8 1 theano.tensor.basic.Nonzero
0.0% 100.0% 0.001s 1.92e-06s C 416 52 theano.compile.ops.Shape_i
... (remaining 10 Classes account for 0.01%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
48.1% 48.1% 8.638s 1.08e+00s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}
11.4% 59.5% 2.045s 2.56e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
10.0% 69.5% 1.796s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}
4.6% 74.1% 0.826s 1.03e-01s Py 8 1 forall_inplace,gpu,grad_of_scan_fn}
4.2% 78.3% 0.753s 7.84e-03s C 96 12 GpuDot22
3.6% 81.9% 0.647s 2.69e-02s C 24 3 HostFromGpu
3.4% 85.3% 0.613s 7.66e-02s C 8 1 SoftmaxGrad
2.7% 88.0% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64}
2.4% 90.5% 0.436s 1.82e-02s Py 24 3 AdvancedSubtensor
1.8% 92.3% 0.321s 1.34e-02s C 24 3 GpuFromHost
1.8% 94.0% 0.317s 3.96e-02s C 8 1 Elemwise{clip,no_inplace}
1.7% 95.7% 0.302s 1.26e-02s C 24 3 Elemwise{mul,no_inplace}
1.0% 96.7% 0.173s 2.17e-02s Py 8 1 forall_inplace,gpu,scan_fn}
0.9% 97.6% 0.169s 1.06e-02s C 16 2 Alloc
0.7% 98.3% 0.129s 8.48e-04s C 152 19 GpuReshape{2}
0.3% 98.6% 0.054s 1.35e-03s C 40 5 GpuCAReduce{add}{1,1,0}
0.2% 98.8% 0.035s 1.11e-03s C 32 4 GpuIncSubtensor{Inc;:int64:}
0.2% 99.0% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)]
0.2% 99.1% 0.027s 2.84e-04s C 96 12 GpuAlloc{memset_0=True}
0.1% 99.3% 0.025s 2.39e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace}
... (remaining 109 Ops account for 0.71%(0.13s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
48.1% 48.1% 8.638s 1.08e+00s 8 424 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
11.4% 59.5% 2.045s 2.56e-01s 8 427 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0)
10.0% 69.5% 1.796s 2.24e-01s 8 425 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0)
4.6% 74.1% 0.826s 1.03e-01s 8 453 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{Tanh}[(0, 0)].0, GpuDimShuffle{0,2,1}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0
3.4% 77.5% 0.613s 7.66e-02s 8 431 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0)
1.8% 79.4% 0.326s 4.07e-02s 8 406 HostFromGpu(GpuReshape{3}.0)
1.8% 81.2% 0.320s 4.01e-02s 8 401 HostFromGpu(GpuSoftmax.0)
1.8% 82.9% 0.317s 3.96e-02s 8 416 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0})
1.7% 84.6% 0.302s 3.77e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0)
1.2% 85.8% 0.218s 2.73e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0)
1.2% 87.0% 0.218s 2.72e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.0% 88.0% 0.173s 2.17e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(
0.9% 88.9% 0.169s 2.11e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0)
0.9% 89.9% 0.166s 2.07e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>)
0.9% 90.8% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)
0.9% 91.6% 0.161s 2.01e-02s 8 426 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0)
0.9% 92.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0)
0.9% 93.4% 0.155s 1.93e-02s 8 433 GpuFromHost(SoftmaxGrad.0)
0.5% 93.9% 0.092s 1.15e-02s 8 91 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0)
0.4% 94.3% 0.066s 8.25e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0)
... (remaining 512 Apply instances account for 5.72%(1.03s) of the runtime)
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 8 calls of the op (for a total of 240 steps) 1.686399e-01s
Total time spent in calling the VM 1.647773e-01s (97.710%)
Total overhead (computing slices..) 3.862619e-03s (2.290%)
Time in all call to theano.grad() 1.037111e+00s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
79.9% 79.9% 0.130s 1.35e-04s C 960 4 theano.sandbox.cuda.blas.GpuGemm
15.2% 95.1% 0.025s 3.44e-05s C 720 3 theano.sandbox.cuda.basic_ops.GpuElemwise
4.7% 99.8% 0.008s 3.18e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost
0.2% 100.0% 0.000s 1.09e-06s C 240 1 theano.tensor.elemwise.Elemwise
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
79.9% 79.9% 0.130s 1.35e-04s C 960 4 GpuGemm{no_inplace}
5.9% 85.8% 0.010s 4.00e-05s C 240 1 GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * (i4 * i5)) + (clip((i0 + i6), i2, i3) * tanh(i7)))},no_inplace}
5.2% 91.0% 0.008s 3.50e-05s C 240 1 GpuElemwise{mul,no_inplace}
4.7% 95.7% 0.008s 3.18e-05s C 240 1 GpuFromHost
4.2% 99.8% 0.007s 2.81e-05s C 240 1 GpuElemwise{Composite{(clip((i0 + i1), i2, i3) * tanh(i4))},no_inplace}
0.2% 100.0% 0.000s 1.09e-06s C 240 1 Elemwise{Cast{float32}}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
20.6% 20.6% 0.033s 1.39e-04s 240 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
20.1% 40.7% 0.033s 1.36e-04s 240 6 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298})
19.7% 60.4% 0.032s 1.33e-04s 240 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298})
19.5% 79.9% 0.032s 1.32e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298})
5.9% 85.8% 0.010s 4.00e-05s 240 7 GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * (i4 * i5)) + (clip((i0 + i6), i2, i3) * tanh(i7)))},no_inplace}(CudaNdarrayConstant{[[ 0.5]]}, GpuGemm{no_inplace}.0, CudaNdarrayConstant{[[ 0.]]}, CudaNdarrayConstant{[[ 1.]]}, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuGemm{no_inplace}.0, GpuGemm{no_inplace}.0)
5.2% 91.0% 0.008s 3.50e-05s 240 2 GpuElemwise{mul,no_inplace}(GpuFromHost.0, <CudaNdarrayType(float32, matrix)>)
4.7% 95.7% 0.008s 3.18e-05s 240 1 GpuFromHost(Elemwise{Cast{float32}}.0)
4.2% 99.8% 0.007s 2.81e-05s 240 8 GpuElemwise{Composite{(clip((i0 + i1), i2, i3) * tanh(i4))},no_inplace}(CudaNdarrayConstant{[[ 0.5]]}, GpuGemm{no_inplace}.0, CudaNdarrayConstant{[[ 0.]]}, CudaNdarrayConstant{[[ 1.]]}, GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * (i4 * i5)) + (clip((i0 + i6), i2, i3) * tanh(i7)))},no_inplace}.0)
0.2% 100.0% 0.000s 1.09e-06s 240 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>)
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
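
The clip((i0 + i1), 0., 1.) composites above, together with the 0.20000000298 Gemm scale and the [[ 0.5]] constants, are the hard-sigmoid pattern max(0, min(1, 0.2*x + 0.5)), so this third run apparently swaps the logistic inner activation for its piecewise-linear approximation. A minimal Theano sketch of that activation (illustrative, not code from this gist):

import theano.tensor as T

def hard_sigmoid(x):
    # piecewise-linear stand-in for the logistic sigmoid
    return T.clip(0.2 * x + 0.5, 0.0, 1.0)
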
Scan Op profiling ( grad_of_scan_fn )
==================
Message: None
Time in 8 calls of the op (for a total of 240 steps) 8.120770e-01s
Total time spent in calling the VM 6.931341e-01s (85.353%)
Total overhead (computing slices..) 1.189430e-01s (14.647%)
Time in all call to theano.grad() 1.037111e+00s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
49.0% 49.0% 0.336s 1.27e-04s C 2640 11 theano.sandbox.cuda.blas.GpuGemm
27.1% 76.1% 0.185s 2.97e-05s C 6240 26 theano.sandbox.cuda.basic_ops.GpuElemwise
22.4% 98.5% 0.154s 1.07e-04s C 1440 6 theano.sandbox.cuda.blas.GpuDot22
1.4% 100.0% 0.010s 4.07e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost
0.0% 100.0% 0.000s 1.42e-06s C 240 1 theano.tensor.elemwise.Elemwise
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
29.3% 29.3% 0.201s 1.19e-04s C 1680 7 GpuGemm{inplace}
22.4% 51.7% 0.154s 1.07e-04s C 1440 6 GpuDot22
19.7% 71.4% 0.135s 1.41e-04s C 960 4 GpuGemm{no_inplace}
8.1% 79.5% 0.055s 3.83e-05s C 1440 6 GpuElemwise{add,no_inplace}
2.2% 81.7% 0.015s 2.10e-05s C 720 3 GpuElemwise{Add}[(0, 1)]
2.0% 83.7% 0.014s 2.91e-05s C 480 2 GpuElemwise{mul,no_inplace}
2.0% 85.7% 0.014s 2.83e-05s C 480 2 GpuElemwise{Mul}[(0, 1)]
1.8% 87.5% 0.012s 5.01e-05s C 240 1 GpuElemwise{Composite{((((i0 * clip(i1, i2, i3)) * i4) + ((i5 * clip(i1, i2, i3)) * i4)) + i6)},no_inplace}
1.7% 89.2% 0.012s 2.47e-05s C 480 2 GpuElemwise{Clip}[(0, 0)]
1.7% 90.9% 0.011s 4.78e-05s C 240 1 GpuElemwise{Composite{((((i0 + i1) * i2) + (i3 * i2)) + i4)},no_inplace}
1.5% 92.4% 0.010s 2.12e-05s C 480 2 GpuElemwise{Composite{Cast{float32}(AND(GE(i0, i1), LE(i0, i2)))},no_inplace}
1.4% 93.8% 0.010s 4.07e-05s C 240 1 GpuFromHost
1.1% 94.9% 0.008s 3.15e-05s C 240 1 GpuElemwise{Composite{((i0 * i1) * i2)},no_inplace}
1.1% 96.0% 0.007s 3.08e-05s C 240 1 GpuElemwise{Mul}[(0, 3)]
0.9% 96.9% 0.006s 2.67e-05s C 240 1 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)]
0.8% 97.7% 0.005s 2.29e-05s C 240 1 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)]
0.8% 98.5% 0.005s 2.27e-05s C 240 1 GpuElemwise{Tanh}[(0, 0)]
0.8% 99.3% 0.005s 2.15e-05s C 240 1 GpuElemwise{Composite{Cast{float32}(AND(GE(i0, i1), LE(i0, i2)))}}[(0, 0)]
0.7% 100.0% 0.005s 1.94e-05s C 240 1 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)]
0.0% 100.0% 0.000s 1.42e-06s C 240 1 Elemwise{Cast{float32}}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
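
The Composite{(i0 - sqr(i1))} op near the bottom of this table is consistent with the standard tanh backprop identity: for y = tanh(x), dy/dx = 1 - tanh(x)^2 = 1 - y^2, so the gradient scan can compute it as (1 - sqr(y)) from the cached forward activation instead of re-evaluating tanh.
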
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
5.4% 5.4% 0.037s 1.53e-04s 240 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298})
5.0% 10.3% 0.034s 1.41e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298})
4.7% 15.0% 0.032s 1.35e-04s 240 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298})
4.7% 19.7% 0.032s 1.34e-04s 240 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.4% 24.1% 0.030s 1.24e-04s 240 24 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.3% 28.4% 0.029s 1.23e-04s 240 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.3% 32.6% 0.029s 1.22e-04s 240 38 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.2% 36.9% 0.029s 1.21e-04s 240 29 GpuDot22(GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>)
4.2% 41.1% 0.029s 1.20e-04s 240 41 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0})
4.1% 45.2% 0.028s 1.16e-04s 240 16 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>)
4.1% 49.2% 0.028s 1.16e-04s 240 27 GpuDot22(GpuElemwise{Mul}[(0, 3)].0, <CudaNdarrayType(float32, matrix)>)
4.1% 53.3% 0.028s 1.16e-04s 240 33 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0})
4.1% 57.3% 0.028s 1.16e-04s 240 31 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0})
4.0% 61.4% 0.028s 1.15e-04s 240 42 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, TensorConstant{1.0})
3.4% 64.8% 0.023s 9.66e-05s 240 23 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0)
3.3% 68.1% 0.023s 9.52e-05s 240 26 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 3)].0)
3.3% 71.4% 0.023s 9.51e-05s 240 37 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0)
1.8% 73.3% 0.013s 5.24e-05s 240 36 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0)
1.8% 75.1% 0.012s 5.14e-05s 240 44 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0)
1.8% 76.9% 0.012s 5.11e-05s 240 39 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0)
... (remaining 25 Apply instances account for 23.13%(0.16s) of the runtime)
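
These per-step tables are printed because profiling is enabled for the Scan op itself. A minimal sketch of how to obtain such a "Scan Op profiling" block (illustrative shapes and names, not the code behind this gist):

    import numpy
    import theano
    import theano.tensor as T

    theano.config.profile = True       # or run with THEANO_FLAGS=profile=True

    X = T.tensor3('X')                 # (steps, batch, features)
    W = theano.shared(numpy.eye(8, dtype='float32'))

    def step(x_t, h_tm1):
        return T.tanh(T.dot(x_t, W) + h_tm1)

    h, _ = theano.scan(step, sequences=X,
                       outputs_info=T.zeros_like(X[0]),
                       profile=True)   # request the inner-loop profile too
    f = theano.function([X], h[-1])
    f(numpy.zeros((3, 4, 8), dtype='float32'))
    # the function profile and the Scan Op profile print at interpreter exit
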
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:330
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 2.442388e+01s
Number of Apply nodes: 543
Theano Optimizer time: 2.154786e+01s
Theano validate time: 3.120501e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 2.651706e+00s
Import time 1.034188e-02s
Time in all call to theano.grad() 1.037111e+00s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:332
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 2.318422e+00s
Number of Apply nodes: 137
Theano Optimizer time: 1.716493e+00s
Theano validate time: 5.609584e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 4.905031e-01s
Import time 7.891417e-03s
Time in all call to theano.grad() 1.037111e+00s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:334
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 2.467041e+00s
Number of Apply nodes: 155
Theano Optimizer time: 1.826469e+00s
Theano validate time: 7.342625e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 5.233629e-01s
Import time 5.436182e-03s
Time in all call to theano.grad() 1.037111e+00s
Function profiling
==================
Message: build/bdist.linux-x86_64/egg/keras/models.py:336
Time in 0 calls to Function.__call__: 0.000000e+00s
Total compile time: 3.259403e+00s
Number of Apply nodes: 166
Theano Optimizer time: 2.627834e+00s
Theano validate time: 7.786059e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 5.099900e-01s
Import time 0.000000e+00s
Time in all call to theano.grad() 1.037111e+00s
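
The four functions above (models.py:330, 332, 334 and 336) were compiled but never called in this run ("Time in 0 calls"), yet together they cost about 32.5 s of compile time, most of it in the Theano optimizer. The numbers printed here are also available programmatically; a sketch, assuming Theano's ProfileStats attributes (the hypothetical f stands in for any function compiled with profile=True):

    import theano
    import theano.tensor as T

    x = T.matrix('x')
    f = theano.function([x], T.nnet.softmax(x), profile=True)
    print(f.profile.compile_time)    # "Total compile time"
    print(f.profile.optimizer_time)  # "Theano Optimizer time"
    print(f.profile.linker_time)     # "Theano Linker time (includes ...)"
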
Function profiling
==================
Message: Sum of all(5) printed profiles at exit excluding Scan op profile.
Time in 8 calls to Function.__call__: 1.868278e+01s
Time in Function.fn.__call__: 1.834628e+01s (98.199%)
Time in thunks: 1.794744e+01s (96.064%)
Total compile time: 5.692453e+01s
Number of Apply nodes: 532
Theano Optimizer time: 4.953296e+01s
Theano validate time: 8.184333e-01s
Theano Linker time (includes C, CUDA code generation/compiling): 6.594770e+00s
Import time 2.145724e-01s
Time in all call to theano.grad() 1.037111e+00s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
61.6% 61.6% 11.056s 6.61e-03s C 1672 209 theano.tensor.elemwise.Elemwise
11.4% 73.0% 2.045s 2.56e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor
5.6% 78.6% 0.999s 6.24e-02s Py 16 2 theano.scan_module.scan_op.Scan
4.2% 82.8% 0.753s 7.84e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22
3.6% 86.4% 0.647s 2.69e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu
3.4% 89.8% 0.613s 7.66e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad
2.7% 92.5% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum
2.4% 94.9% 0.436s 1.82e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor
1.8% 96.7% 0.321s 1.34e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost
0.9% 97.6% 0.169s 1.06e-02s C 16 2 theano.tensor.basic.Alloc
0.7% 98.3% 0.129s 5.98e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape
0.7% 99.1% 0.129s 2.48e-04s C 520 65 theano.sandbox.cuda.basic_ops.GpuElemwise
0.3% 99.4% 0.054s 1.35e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce
0.2% 99.6% 0.044s 6.08e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor
0.2% 99.8% 0.028s 2.67e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc
0.1% 99.9% 0.016s 1.99e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar
0.1% 100.0% 0.009s 1.15e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten
0.0% 100.0% 0.002s 2.06e-04s Py 8 1 theano.tensor.basic.Nonzero
0.0% 100.0% 0.001s 1.92e-06s C 416 52 theano.compile.ops.Shape_i
... (remaining 10 Classes account for 0.01%(0.00s) of the runtime)
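
Even in this GPU run, 61.6% of thunk time is CPU Elemwise and another 11.4% is the CPU AdvancedIncSubtensor, while the HostFromGpu/GpuFromHost rows are the cost of bouncing those values between host and device; the fancy indexing (AdvancedSubtensor with two index vectors) plausibly has no GPU implementation in this Theano version, pinning the whole loss chain to the host. One way to locate the nodes that stayed on the CPU (sketch; f stands for a compiled function such as the train function profiled here):

    import theano
    # ops whose names lack a Gpu* prefix (Elemwise, SoftmaxGrad,
    # AdvancedIncSubtensor, ...) execute on the host
    theano.printing.debugprint(f)
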
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
48.1% 48.1% 8.638s 1.08e+00s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}}
11.4% 59.5% 2.045s 2.56e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}
10.0% 69.5% 1.796s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}
4.6% 74.1% 0.826s 1.03e-01s Py 8 1 forall_inplace,gpu,grad_of_scan_fn}
4.2% 78.3% 0.753s 7.84e-03s C 96 12 GpuDot22
3.6% 81.9% 0.647s 2.69e-02s C 24 3 HostFromGpu
3.4% 85.3% 0.613s 7.66e-02s C 8 1 SoftmaxGrad
2.7% 88.0% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64}
2.4% 90.5% 0.436s 1.82e-02s Py 24 3 AdvancedSubtensor
1.8% 92.3% 0.321s 1.34e-02s C 24 3 GpuFromHost
1.8% 94.0% 0.317s 3.96e-02s C 8 1 Elemwise{clip,no_inplace}
1.7% 95.7% 0.302s 1.26e-02s C 24 3 Elemwise{mul,no_inplace}
1.0% 96.7% 0.173s 2.17e-02s Py 8 1 forall_inplace,gpu,scan_fn}
0.9% 97.6% 0.169s 1.06e-02s C 16 2 Alloc
0.7% 98.3% 0.129s 8.48e-04s C 152 19 GpuReshape{2}
0.3% 98.6% 0.054s 1.35e-03s C 40 5 GpuCAReduce{add}{1,1,0}
0.2% 98.8% 0.035s 1.11e-03s C 32 4 GpuIncSubtensor{Inc;:int64:}
0.2% 99.0% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)]
0.2% 99.1% 0.027s 2.84e-04s C 96 12 GpuAlloc{memset_0=True}
0.1% 99.3% 0.025s 2.39e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace}
... (remaining 109 Ops account for 0.71%(0.13s) of the runtime)
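
The three most expensive ops fit together as a clipped, renormalized categorical crossentropy: Elemwise{clip,no_inplace} bounds the predictions to [1e-07, 1.0], Composite{(i0 * log((i1 / i2)))} is y * log(p / sum(p)), and the AND(GE, LE) composite is the corresponding gradient masked to the clip range. A sketch of an expression that lowers to these ops (an assumption in the style of Keras objectives of this era, not a quote of the model's source):

    import theano.tensor as T

    def categorical_crossentropy(y_true, y_pred, eps=1e-07):
        p = T.clip(y_pred, eps, 1.0)           # Elemwise{clip,no_inplace}
        s = p.sum(axis=-1, keepdims=True)      # the InplaceDimShuffle{0,x} denominator
        # Composite{(i0 * log((i1 / i2)))}, reduced by Sum{axis=[1]}
        return -T.sum(y_true * T.log(p / s), axis=-1)
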
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
48.1% 48.1% 8.638s 1.08e+00s 8 424 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0)
11.4% 59.5% 2.045s 2.56e-01s 8 427 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0)
10.0% 69.5% 1.796s 2.24e-01s 8 425 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0)
4.6% 74.1% 0.826s 1.03e-01s 8 453 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{Tanh}[(0, 0)].0, GpuDimShuffle{0,2,1}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0
3.4% 77.5% 0.613s 7.66e-02s 8 431 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0)
1.8% 79.4% 0.326s 4.07e-02s 8 406 HostFromGpu(GpuReshape{3}.0)
1.8% 81.2% 0.320s 4.01e-02s 8 401 HostFromGpu(GpuSoftmax.0)
1.8% 82.9% 0.317s 3.96e-02s 8 416 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0})
1.7% 84.6% 0.302s 3.77e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0)
1.2% 85.8% 0.218s 2.73e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0)
1.2% 87.0% 0.218s 2.72e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0)
1.0% 88.0% 0.173s 2.17e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(
0.9% 88.9% 0.169s 2.11e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0)
0.9% 89.9% 0.166s 2.07e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>)
0.9% 90.8% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)
0.9% 91.6% 0.161s 2.01e-02s 8 426 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0)
0.9% 92.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0)
0.9% 93.4% 0.155s 1.93e-02s 8 433 GpuFromHost(SoftmaxGrad.0)
0.5% 93.9% 0.092s 1.15e-02s 8 91 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0)
0.4% 94.3% 0.066s 8.25e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0)
... (remaining 512 Apply instances account for 5.72%(1.03s) of the runtime)
training time: 19.0255179405 s
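
Cross-checking the figure above against the summed profile: 1.868278e+01 s in Function.__call__ out of 19.0255 s of wall-clock training time is 18.68 / 19.03 ≈ 98%, so nearly the whole run is spent inside compiled Theano functions and Python-side overhead is negligible.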