Merge pull request BVLC#816 from shelhamer/pycaffe-labels-grayscale-attrs-examples

Improve and polish pycaffe
shelhamer committed Aug 6, 2014
2 parents d1d499d + 0db9478 commit 52d7a48
Showing 11 changed files with 257 additions and 196 deletions.
2 changes: 1 addition & 1 deletion examples/detection.ipynb
@@ -36,7 +36,7 @@
"input": [
"!mkdir -p _temp\n",
"!echo `pwd`/images/fish-bike.jpg > _temp/det_input.txt\n",
"!../python/detect.py --crop_mode=selective_search --pretrained_model=imagenet/caffe_rcnn_imagenet_model --model_def=imagenet/rcnn_imagenet_deploy.prototxt --gpu _temp/det_input.txt _temp/det_output.h5"
"!../python/detect.py --crop_mode=selective_search --pretrained_model=imagenet/caffe_rcnn_imagenet_model --model_def=imagenet/rcnn_imagenet_deploy.prototxt --gpu --raw_scale=255 _temp/det_input.txt _temp/det_output.h5"
],
"language": "python",
"metadata": {},
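
For context on the notebook change above: caffe.io.load_image returns images as float32 arrays in [0, 1], while the reference ImageNet models expect inputs on a [0, 255] scale, which is what the new --raw_scale=255 flag restores. A minimal illustrative sketch of that scaling, reusing the notebook's image path (not part of the diff):

import caffe

# caffe.io.load_image returns a float32 array in [0, 1] (H x W x 3, RGB),
# per the io.py docstring updated later in this diff.
im = caffe.io.load_image('images/fish-bike.jpg')
print(im.dtype, im.min(), im.max())    # float32, ~0.0, ~1.0

# --raw_scale=255 tells detect.py to multiply the raw input by 255 so it
# matches the [0, 255] range the reference ImageNet models were trained on.
print((im * 255).max())                # ~255
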
92 changes: 38 additions & 54 deletions examples/filter_visualization.ipynb

Large diffs are not rendered by default.

79 changes: 36 additions & 43 deletions examples/imagenet_classification.ipynb

Large diffs are not rendered by default.

29 changes: 17 additions & 12 deletions examples/net_surgery.ipynb

Large diffs are not rendered by default.

36 changes: 23 additions & 13 deletions python/caffe/_caffe.cpp
@@ -25,6 +25,7 @@


using namespace caffe; // NOLINT(build/namespaces)
using boost::python::dict;
using boost::python::extract;
using boost::python::len;
using boost::python::list;
@@ -274,6 +275,11 @@ struct CaffeNet {

// The pointer to the internal caffe::Net instance.
shared_ptr<Net<float> > net_;
// Input preprocessing configuration attributes.
dict mean_;
dict input_scale_;
dict raw_scale_;
dict channel_swap_;
// if taking input from an ndarray, we need to hold references
object input_data_;
object input_labels_;
@@ -311,19 +317,23 @@ BOOST_PYTHON_MODULE(_caffe) {
boost::python::class_<CaffeNet, shared_ptr<CaffeNet> >(
"Net", boost::python::init<string, string>())
.def(boost::python::init<string>())
.def("_forward", &CaffeNet::Forward)
.def("_backward", &CaffeNet::Backward)
.def("set_mode_cpu", &CaffeNet::set_mode_cpu)
.def("set_mode_gpu", &CaffeNet::set_mode_gpu)
.def("set_phase_train", &CaffeNet::set_phase_train)
.def("set_phase_test", &CaffeNet::set_phase_test)
.def("set_device", &CaffeNet::set_device)
.add_property("_blobs", &CaffeNet::blobs)
.add_property("layers", &CaffeNet::layers)
.add_property("inputs", &CaffeNet::inputs)
.add_property("outputs", &CaffeNet::outputs)
.def("_set_input_arrays", &CaffeNet::set_input_arrays)
.def("save", &CaffeNet::save);
.def("_forward", &CaffeNet::Forward)
.def("_backward", &CaffeNet::Backward)
.def("set_mode_cpu", &CaffeNet::set_mode_cpu)
.def("set_mode_gpu", &CaffeNet::set_mode_gpu)
.def("set_phase_train", &CaffeNet::set_phase_train)
.def("set_phase_test", &CaffeNet::set_phase_test)
.def("set_device", &CaffeNet::set_device)
.add_property("_blobs", &CaffeNet::blobs)
.add_property("layers", &CaffeNet::layers)
.add_property("inputs", &CaffeNet::inputs)
.add_property("outputs", &CaffeNet::outputs)
.add_property("mean", &CaffeNet::mean_)
.add_property("input_scale", &CaffeNet::input_scale_)
.add_property("raw_scale", &CaffeNet::raw_scale_)
.add_property("channel_swap", &CaffeNet::channel_swap_)
.def("_set_input_arrays", &CaffeNet::set_input_arrays)
.def("save", &CaffeNet::save);

boost::python::class_<CaffeBlob, CaffeBlobWrap>(
"Blob", boost::python::no_init)
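
The _caffe.cpp change above exposes the preprocessing configuration (mean, input_scale, raw_scale, channel_swap) as boost::python dicts keyed by input blob name. A hedged sketch of how these look from Python, assuming the set_* helpers from python/caffe/pycaffe.py (which fill these dicts) and placeholder model paths:

import caffe

net = caffe.Net('deploy.prototxt', 'weights.caffemodel')  # placeholder paths

in_ = net.inputs[0]                   # name of the first input blob
net.set_raw_scale(in_, 255)           # pycaffe.py helpers populate the dicts
net.set_channel_swap(in_, (2, 1, 0))

# Each new property is a dict keyed by input blob name.
print(net.raw_scale[in_])             # 255
print(net.channel_swap.get(in_))      # the channel order set above
print(net.mean.get(in_))              # None until set_mean() is called
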
36 changes: 22 additions & 14 deletions python/caffe/classifier.py
@@ -14,13 +14,14 @@ class Classifier(caffe.Net):
by scaling, center cropping, or oversampling.
"""
def __init__(self, model_file, pretrained_file, image_dims=None,
gpu=False, mean_file=None, input_scale=None, channel_swap=None):
gpu=False, mean=None, input_scale=None, raw_scale=None,
channel_swap=None):
"""
Take
image_dims: dimensions to scale input for cropping/sampling.
Default is to scale to net input size for whole-image crop.
gpu, mean_file, input_scale, channel_swap: convenience params for
setting mode, mean, input scale, and channel order.
Default is to scale to net input size for whole-image crop.
gpu, mean, input_scale, raw_scale, channel_swap: params for
preprocessing options.
"""
caffe.Net.__init__(self, model_file, pretrained_file)
self.set_phase_test()
@@ -30,11 +31,13 @@ def __init__(self, model_file, pretrained_file, image_dims=None,
else:
self.set_mode_cpu()

if mean_file:
self.set_mean(self.inputs[0], mean_file)
if input_scale:
if mean is not None:
self.set_mean(self.inputs[0], mean)
if input_scale is not None:
self.set_input_scale(self.inputs[0], input_scale)
if channel_swap:
if raw_scale is not None:
self.set_raw_scale(self.inputs[0], raw_scale)
if channel_swap is not None:
self.set_channel_swap(self.inputs[0], channel_swap)

self.crop_dims = np.array(self.blobs[self.inputs[0]].data.shape[2:])
@@ -57,24 +60,29 @@ def predict(self, inputs, oversample=True):
for N images and C classes.
"""
# Scale to standardize input dimensions.
inputs = np.asarray([caffe.io.resize_image(im, self.image_dims)
for im in inputs])
input_ = np.zeros((len(inputs),
self.image_dims[0], self.image_dims[1], inputs[0].shape[2]),
dtype=np.float32)
for ix, in_ in enumerate(inputs):
input_[ix] = caffe.io.resize_image(in_, self.image_dims)

if oversample:
# Generate center, corner, and mirrored crops.
inputs = caffe.io.oversample(inputs, self.crop_dims)
input_ = caffe.io.oversample(input_, self.crop_dims)
else:
# Take center crop.
center = np.array(self.image_dims) / 2.0
crop = np.tile(center, (1, 2))[0] + np.concatenate([
-self.crop_dims / 2.0,
self.crop_dims / 2.0
])
inputs = inputs[:, crop[0]:crop[2], crop[1]:crop[3], :]
input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :]

# Classify
caffe_in = np.asarray([self.preprocess(self.inputs[0], in_)
for in_ in inputs])
caffe_in = np.zeros(np.array(input_.shape)[[0,3,1,2]],
dtype=np.float32)
for ix, in_ in enumerate(input_):
caffe_in[ix] = self.preprocess(self.inputs[0], in_)
out = self.forward_all(**{self.inputs[0]: caffe_in})
predictions = out[self.outputs[0]].squeeze(axis=(2,3))

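
Taken together, the classifier.py changes replace the old mean_file argument with an explicit mean array and split scaling into raw_scale (for the raw [0, 1] input) and input_scale (for the preprocessed input). A hedged usage sketch with placeholder paths, not part of the diff:

import caffe

# Placeholder paths; the mean can now be passed as an ndarray via mean=...
net = caffe.Classifier('deploy.prototxt', 'weights.caffemodel',
                       image_dims=(256, 256),
                       raw_scale=255,           # inputs arrive in [0, 1] from caffe.io
                       channel_swap=(2, 1, 0),  # RGB -> BGR for the reference models
                       gpu=False)

im = caffe.io.load_image('cat.jpg')              # placeholder image
# predict() resizes to image_dims, takes crops (10 when oversample=True),
# preprocesses each crop into K x H x W order, and averages the crop scores.
scores = net.predict([im], oversample=True)
print(scores.shape)                              # (1, number of classes)
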
47 changes: 30 additions & 17 deletions python/caffe/detector.py
@@ -24,12 +24,13 @@ class Detector(caffe.Net):
Detector extends Net for windowed detection by a list of crops or
selective search proposals.
"""
def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None,
input_scale=None, channel_swap=None, context_pad=None):
def __init__(self, model_file, pretrained_file, gpu=False, mean=None,
input_scale=None, raw_scale=None, channel_swap=None,
context_pad=None):
"""
Take
gpu, mean_file, input_scale, channel_swap: convenience params for
setting mode, mean, input scale, and channel order.
gpu, mean, input_scale, raw_scale, channel_swap: params for
preprocessing options.
context_pad: amount of surrounding context to take s.t. a `context_pad`
sized border of pixels in the network input image is context, as in
R-CNN feature extraction.
@@ -42,11 +43,13 @@ def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None,
else:
self.set_mode_cpu()

if mean_file:
self.set_mean(self.inputs[0], mean_file)
if input_scale:
if mean is not None:
self.set_mean(self.inputs[0], mean)
if input_scale is not None:
self.set_input_scale(self.inputs[0], input_scale)
if channel_swap:
if raw_scale is not None:
self.set_raw_scale(self.inputs[0], raw_scale)
if channel_swap is not None:
self.set_channel_swap(self.inputs[0], channel_swap)

self.configure_crop(context_pad)
@@ -73,8 +76,11 @@ def detect_windows(self, images_windows):
window_inputs.append(self.crop(image, window))

# Run through the net (warping windows to input dimensions).
caffe_in = np.asarray([self.preprocess(self.inputs[0], window_in)
for window_in in window_inputs])
caffe_in = np.zeros((len(window_inputs), window_inputs[0].shape[2])
+ self.blobs[self.inputs[0]].data.shape[2:],
dtype=np.float32)
for ix, window_in in enumerate(window_inputs):
caffe_in[ix] = self.preprocess(self.inputs[0], window_in)
out = self.forward_all(**{self.inputs[0]: caffe_in})
predictions = out[self.outputs[0]].squeeze(axis=(2,3))

@@ -180,12 +186,19 @@ def configure_crop(self, context_pad):
"""
self.context_pad = context_pad
if self.context_pad:
input_scale = self.input_scale.get(self.inputs[0])
raw_scale = self.raw_scale.get(self.inputs[0])
channel_order = self.channel_swap.get(self.inputs[0])
# Padding context crops needs the mean in unprocessed input space.
self.crop_mean = self.mean[self.inputs[0]].copy()
self.crop_mean = self.crop_mean.transpose((1,2,0))
channel_order_inverse = [channel_order.index(i)
for i in range(self.crop_mean.shape[2])]
self.crop_mean = self.crop_mean[:,:, channel_order_inverse]
self.crop_mean /= input_scale
mean = self.mean.get(self.inputs[0])
if mean is not None:
crop_mean = mean.copy().transpose((1,2,0))
if channel_order is not None:
channel_order_inverse = [channel_order.index(i)
for i in range(crop_mean.shape[2])]
crop_mean = crop_mean[:,:, channel_order_inverse]
if raw_scale is not None:
crop_mean /= raw_scale
self.crop_mean = crop_mean
else:
self.crop_mean = np.zeros(self.blobs[self.inputs[0]].data.shape,
dtype=np.float32)
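
The detector.py constructor gets the same parameters, and configure_crop now tolerates missing mean / raw_scale / channel_swap settings. A hedged sketch of windowed detection with placeholder paths; the (filename, windows) input pairing and the keys of the returned entries are assumptions based on the R-CNN detection example of this era, not shown in the diff:

import numpy as np
import caffe

detector = caffe.Detector('rcnn_deploy.prototxt', 'rcnn.caffemodel',  # placeholders
                          raw_scale=255,
                          channel_swap=(2, 1, 0),
                          context_pad=16)   # R-CNN style context padding

# One whole-image window per image as (ymin, xmin, ymax, xmax),
# assuming a 640x480 input image.
windows = np.array([[0, 0, 480, 640]])
detections = detector.detect_windows([('fish-bike.jpg', windows)])
# Each entry is expected to pair a window with its prediction vector,
# e.g. {'window': ..., 'prediction': ..., 'filename': ...}.
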
18 changes: 15 additions & 3 deletions python/caffe/io.py
@@ -1,6 +1,7 @@
import numpy as np
import skimage.io
import skimage.transform
from scipy.ndimage import zoom
from skimage.transform import resize

from caffe.proto import caffe_pb2

@@ -15,7 +16,8 @@ def load_image(filename, color=True):
loads as intensity (if image is already grayscale).
Give
image: an image with type np.float32 of size (H x W x 3) in RGB or
image: an image with type np.float32 in range [0, 1]
of size (H x W x 3) in RGB or
of size (H x W x 1) in grayscale.
"""
img = skimage.img_as_float(skimage.io.imread(filename)).astype(np.float32)
@@ -40,7 +42,17 @@ def resize_image(im, new_dims, interp_order=1):
Give
im: resized ndarray with shape (new_dims[0], new_dims[1], K)
"""
return skimage.transform.resize(im, new_dims, order=interp_order)
if im.shape[-1] == 1 or im.shape[-1] == 3:
# skimage is fast but only understands {1,3} channel images in [0, 1].
im_min, im_max = im.min(), im.max()
im_std = (im - im_min) / (im_max - im_min)
resized_std = resize(im_std, new_dims, order=interp_order)
resized_im = resized_std * (im_max - im_min) + im_min
else:
# ndimage interpolates anything but more slowly.
scale = tuple(np.array(new_dims) / np.array(im.shape[:2]))
resized_im = zoom(im, scale + (1,), order=interp_order)
return resized_im.astype(np.float32)


def oversample(images, crop_dims):
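
The resize_image rewrite keeps the fast skimage path for 1- and 3-channel images (rescaling to [0, 1] and back so out-of-range inputs survive the round trip) and falls back to scipy.ndimage.zoom for any other channel count, per the comment in the diff. A small illustrative sketch (not part of the diff):

import numpy as np
import caffe

# 3-channel float image: handled by the skimage path.
rgb = np.random.rand(480, 640, 3).astype(np.float32)
print(caffe.io.resize_image(rgb, (256, 256)).shape)     # (256, 256, 3)

# 4 channels: outside the {1, 3} cases skimage handles here, so it goes
# through scipy.ndimage.zoom (2x upscaling keeps the zoom factors integral).
multi = np.random.rand(128, 128, 4).astype(np.float32)
print(caffe.io.resize_image(multi, (256, 256)).shape)   # (256, 256, 4)
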