diff --git a/example/is_odd.cr b/example/is_odd.cr
index 7476ab9..1178852 100644
--- a/example/is_odd.cr
+++ b/example/is_odd.cr
@@ -5,13 +5,19 @@ include Gradnite
 # BIT_SIZE is our input layer size
 BIT_SIZE = 8
 
+puts "running with #{BIT_SIZE} bits"
+
 # Layer size is approximately 2/3 of the input size
 LAYER_SIZE = (BIT_SIZE * 2/3).ceil.to_i
 
 mlp = MLP.new(BIT_SIZE, [LAYER_SIZE, LAYER_SIZE, 1])
 
-max = 255
-nums = (1..max).to_a
+# Generate all the numbers that can be represented by BIT_SIZE bits
+max = (2 ** BIT_SIZE) - 1
+
+puts "building training set of numbers with a ceiling of #{max}"
+
+nums = (0..max).to_a
 
 def num_to_binary_array(n)
   BIT_SIZE.times.map { |bit|
@@ -30,7 +36,7 @@ ypred = [] of Node
 
 loss = Node.new(0.0)
 
-epochs = 100
+epochs = 50
 
 epochs.times do |k|
   # forward pass
@@ -49,6 +55,12 @@ epochs.times do |k|
   }
   loss.backward
 
+  if loss.value < 0.0001
+    puts "loss: #{loss.value}"
+    puts "converged at epoch #{k}"
+    break
+  end
+
   # Gradient descent. Nudge all the parameters in the opposite direction of the gradient.
   # The gradient is showing us the direction that increases the loss, so we want to go the opposite way.
   # Linear decay of learning rate
@@ -68,8 +80,8 @@ def is_odd?(n, mlp)
   return result > 0.0
 end
 
-puts is_odd?(201, mlp)
-puts is_odd?(202, mlp)
-puts is_odd?(203, mlp)
+puts "201 true - #{is_odd?(201, mlp)}"
+puts "202 false - #{is_odd?(202, mlp)}"
+puts "203 true - #{is_odd?(203, mlp)}"
 
 puts "done"
diff --git a/src/gradnite/gradnite.cr b/src/gradnite/gradnite.cr
index 30c538c..0ebeeef 100644
--- a/src/gradnite/gradnite.cr
+++ b/src/gradnite/gradnite.cr
@@ -130,8 +130,8 @@ module Gradnite
     property bias : Node
 
     def initialize(input_count : Int64)
-      @weights = Array.new(input_count) { Node.new(rand) }
-      @bias = Node.new(rand)
+      @weights = Array.new(input_count) { Node.new(0.01 * rand) }
+      @bias = Node.new(0.0)
     end
 
     def run(x : Array(Float64) | Array(Node))