  • 原版github代码-caffe实现
  • tensorflow实现,相关版本较低,python2,本文根据此代码迁移到python3上面。
  • pytorch实现,但将骨干模型从GoogLeNet改成了ResNet,实验效果得到提升,但没公布预训练权重
  • 注意,有一个人体姿态检测的网络也叫做PoseNet,检索的时候注意不要弄混二者
  • PoseNet论文总结


  • Image Matching Challenge 2023
  • 大致任务:根据图片推算相机位姿,包括3*3的旋转矩阵和3维的平移向量
  • 数据描述:train_labels.csv
    • dataset:数据集名字
    • scene:场景
    • image_path:图像路径
    • rotation_matrix:3*3的旋转矩阵
    • translation_vector:3维的平移向量


  • 参考这里
  • 相互转换所需的函数:
#change to 四元数,https://zhuanlan.zhihu.com/p/45404840
def matrix2quaternion(m):
    """Convert a 3x3 rotation matrix to a quaternion ``(w, x, y, z)``.

    Args:
        m: 3x3 rotation matrix, indexable as ``m[row][col]`` (nested
            sequence or ``numpy`` array).

    Returns:
        Tuple ``(w, x, y, z)`` of plain Python floats with ``w >= 0``.

    The naive formula ``w = sqrt(trace + 1) / 2`` followed by a division by
    ``4 * w`` breaks down for rotations close to 180 degrees (``w`` is zero or
    the square root receives a negative number).  The divisor is therefore
    derived from the largest quaternion component instead.  For ``trace > 0``
    the result is identical to the naive formula.
    """
    m00, m01, m02 = (float(v) for v in m[0])
    m10, m11, m12 = (float(v) for v in m[1])
    m20, m21, m22 = (float(v) for v in m[2])
    trace = m00 + m11 + m22

    if trace > 0:
        s = 2 * (trace + 1) ** 0.5  # s == 4 * w
        w = s / 4
        x = (m21 - m12) / s
        y = (m02 - m20) / s
        z = (m10 - m01) / s
    elif m00 >= m11 and m00 >= m22:
        s = 2 * (1 + m00 - m11 - m22) ** 0.5  # s == 4 * x
        w = (m21 - m12) / s
        x = s / 4
        y = (m01 + m10) / s
        z = (m02 + m20) / s
    elif m11 >= m22:
        s = 2 * (1 + m11 - m00 - m22) ** 0.5  # s == 4 * y
        w = (m02 - m20) / s
        x = (m01 + m10) / s
        y = s / 4
        z = (m12 + m21) / s
    else:
        s = 2 * (1 + m22 - m00 - m11) ** 0.5  # s == 4 * z
        w = (m10 - m01) / s
        x = (m02 + m20) / s
        y = (m12 + m21) / s
        z = s / 4

    # q and -q describe the same rotation; keep w non-negative so every
    # matrix maps to one consistent representative.
    if w < 0:
        w, x, y, z = -w, -x, -y, -z
    return w, x, y, z
def quaternion2matrix(q):
    """Build the 3x3 matrix for the quaternion ``q = (w, x, y, z)``.

    The quaternion is used as given (no normalisation is applied here).
    Returns a ``numpy`` array of shape (3, 3).
    """
    w, x, y, z = q
    # Doubled pairwise products shared by the matrix entries.
    xx, yy, zz = 2*x*x, 2*y*y, 2*z*z
    xy, xz, yz = 2*x*y, 2*x*z, 2*y*z
    wx, wy, wz = 2*x*w, 2*y*w, 2*z*w
    row0 = [1 - yy - zz, xy - wz, xz + wy]
    row1 = [xy + wz, 1 - xx - zz, yz - wx]
    row2 = [xz - wy, yz + wx, 1 - xx - yy]
    return np.array([row0, row1, row2])


  • 需要根据rotation_matrix的数据计算出对应的四元数并存储到新列rotation_matrix_quaternion中:(使用列表推导式和map实现)
def m(a):
    """Parse one ``rotation_matrix`` cell and return its quaternion.

    Args:
        a: String with the nine matrix entries separated by ';', taken in
            row-major order (the first three values form the first row).

    Returns:
        Whatever ``matrix2quaternion`` returns for the parsed 3x3 matrix.

    Raises:
        ValueError: If the string does not contain exactly nine numbers.
    """
    values = [float(v) for v in a.split(';')]
    if len(values) != 9:
        raise ValueError(
            'expected 9 rotation matrix entries, got %d' % len(values))
    A = np.array(values).reshape(3, 3)
    return matrix2quaternion(A)

# Set to a falsy value to skip regenerating the augmented label file.
change_train_labels = 1
if change_train_labels:
    train_labels = pd.read_csv('/kaggle/input/image-matching-challenge-2023/train/train_labels.csv')
    # One quaternion per row, derived from the ';'-separated rotation matrix.
    train_labels['rotation_matrix_quaternion'] = [
        m(rotation) for rotation in train_labels['rotation_matrix']
    ]
    # Persist the augmented table; without this the new column only lives in
    # memory and the training step has no file to read.  The pandas default
    # (index column included) is kept on purpose.
    train_labels.to_csv('/kaggle/working/my_train_labels.csv')
  • '/kaggle/input/image-matching-challenge-2023/train/train_labels.csv'读入,写入'/kaggle/working/my_train_labels.csv'


  • 构建神经网络类代码

def layer(op):
    '''Decorator for composable network layers.

    The wrapped method receives the current terminal node(s) as its input
    (a single node, or a list when several nodes are terminal), its result is
    registered in ``self.layers`` under the layer name, and that result
    becomes the new terminal node so that calls can be chained.
    '''
    from functools import wraps

    @wraps(op)
    def layer_decorated(self, *args, **kwargs):
        # Automatically set a name if not provided.
        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
        # Figure out the layer inputs.
        if len(self.terminals) == 0:
            raise RuntimeError('No input variables found for layer %s.' % name)
        elif len(self.terminals) == 1:
            layer_input = self.terminals[0]
        else:
            layer_input = list(self.terminals)
        # Perform the operation and get the output.
        layer_output = op(self, layer_input, *args, **kwargs)
        # Add to layer LUT.
        self.layers[name] = layer_output
        # This output is now the input for the next layer.
        self.feed(layer_output)
        # Return self for chained calls.
        return self

    return layer_decorated

class Network(object):

    def __init__(self, inputs, trainable=True):
        # The input nodes for this network
        self.inputs = inputs
        # The current list of terminal nodes
        self.terminals = []
        # Mapping from layer names to layers
        self.layers = dict(inputs)
        # If true, the resulting variables are set as trainable
        self.trainable = trainable
        # Switch variable for dropout
        self.use_dropout = tf.placeholder_with_default(tf.constant(1.0),

    def setup(self):
        '''Construct the network. '''
        raise NotImplementedError('Must be implemented by the subclass.')

    def load(self, data_path, session, ignore_missing=False):
        '''Load network weights.
        data_path: The path to the numpy-serialized network weights
        session: The current TensorFlow session
        ignore_missing: If true, serialized weights for missing layers are ignored.
        data_dict = np.load(data_path,allow_pickle=True,encoding="latin1").item()
        for op_name in data_dict:
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in data_dict[op_name].items():
                        var = tf.get_variable(param_name)
                    except ValueError:
                        if not ignore_missing:

    def feed(self, *args):
        '''Set the input(s) for the next operation by replacing the terminal nodes.
        The arguments can be either layer names or the actual layers.
        assert len(args) != 0
        self.terminals = []
        for fed_layer in args:
            if isinstance(fed_layer, str):
                    fed_layer = self.layers[fed_layer]
                except KeyError:
                    raise KeyError('Unknown layer name fed: %s' % fed_layer)
        return self

    def get_output(self):
        '''Returns the current network output.'''
        return self.terminals[-1]

    def get_unique_name(self, prefix):
        '''Returns an index-suffixed unique name for the given prefix.
        This is used for auto-generating layer names based on the type-prefix.
        ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
        return '%s_%d' % (prefix, ident)

    def make_var(self, name, shape):
        '''Creates a new TensorFlow variable.'''
        return tf.get_variable(name, shape, trainable=self.trainable)

    def validate_padding(self, padding):
        '''Verifies that the padding is one of the supported ones.'''
        assert padding in ('SAME', 'VALID')

    def conv(self,
        # Verify that the padding is acceptable
        # Get the number of channels in the input
        c_i = input.get_shape()[-1]
        # Verify that the grouping parameter is valid
        assert c_i % group == 0
        assert c_o % group == 0
        # Convolution for a given input and kernel
        convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
        with tf.variable_scope(name) as scope:
            kernel = self.make_var('weights', shape=[k_h, k_w, int(int(c_i) / group), c_o])
            if group == 1:
                # This is the common-case. Convolve the input without any further complications.
                output = convolve(input, kernel)
                # Split the input into groups and then convolve each of them independently
                input_groups = tf.split(3, group, input)
                kernel_groups = tf.split(3, group, kernel)
                output_groups = [convolve(i, k) for i, k in zip(input_groups, kernel_groups)]
                # Concatenate the groups
                output = tf.concat(3, output_groups)
            # Add the biases
            if biased:
                biases = self.make_var('biases', [c_o])
                output = tf.nn.bias_add(output, biases)
            if relu:
                # ReLU non-linearity
                output = tf.nn.relu(output, name=scope.name)
            return output

    def relu(self, input, name):
        return tf.nn.relu(input, name=name)

    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
        return tf.nn.max_pool(input,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],

    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
        return tf.nn.avg_pool(input,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],

    def lrn(self, input, radius, alpha, beta, name, bias=1.0):
        return tf.nn.local_response_normalization(input,

    def concat(self, inputs, axis, name):
        return tf.concat(values=inputs, axis=axis, name=name)

    def add(self, inputs, name):
        return tf.add_n(inputs, name=name)

    def fc(self, input, num_out, name, relu=True):
        with tf.variable_scope(name) as scope:
            input_shape = input.get_shape()
            if input_shape.ndims == 4:
                # The input is spatial. Vectorize it first.
                dim = 1
                for d in input_shape[1:].as_list():
                    dim *= d
                feed_in = tf.reshape(input, [-1, dim])
                feed_in, dim = (input, input_shape[-1].value)
            weights = self.make_var('weights', shape=[dim, num_out])
            biases = self.make_var('biases', [num_out])
            op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b
            fc = op(feed_in, weights, biases, name=scope.name)
            return fc

    def softmax(self, input, name):
        input_shape = map(lambda v: v.value, input.get_shape())
        if len(input_shape) > 2:
            # For certain models (like NiN), the singleton spatial dimensions
            # need to be explicitly squeezed, since they're not broadcast-able
            # in TensorFlow's NHWC ordering (unlike Caffe's NCHW).
            if input_shape[1] == 1 and input_shape[2] == 1:
                input = tf.squeeze(input, squeeze_dims=[1, 2])
                raise ValueError('Rank 2 tensor input expected for softmax!')
        return tf.nn.softmax(input, name)

    def batch_normalization(self, input, name, scale_offset=True, relu=False):
        # NOTE: Currently, only inference is supported
        with tf.variable_scope(name) as scope:
            shape = [input.get_shape()[-1]]
            if scale_offset:
                scale = self.make_var('scale', shape=shape)
                offset = self.make_var('offset', shape=shape)
                scale, offset = (None, None)
            output = tf.nn.batch_normalization(
                mean=self.make_var('mean', shape=shape),
                variance=self.make_var('variance', shape=shape),
                # TODO: This is the default Caffe batch norm eps
                # Get the actual eps from parameters
            if relu:
                output = tf.nn.relu(output)
            return output

    def dropout(self, input, keep_prob, name):
        keep = 1 - self.use_dropout + (self.use_dropout * keep_prob)
        return tf.nn.dropout(input, keep, name=name)
  • 构建骨干网络GoogLeNet:
class GoogLeNet(Network):
    def setup(self):
             .conv(7, 7, 64, 2, 2, name='conv1')
             .max_pool(3, 3, 2, 2, name='pool1')
             .lrn(2, 2e-05, 0.75, name='norm1')
             .conv(1, 1, 64, 1, 1, name='reduction2')
             .conv(3, 3, 192, 1, 1, name='conv2')
             .lrn(2, 2e-05, 0.75, name='norm2')
             .max_pool(3, 3, 2, 2, name='pool2')
             .conv(1, 1, 96, 1, 1, name='icp1_reduction1')
             .conv(3, 3, 128, 1, 1, name='icp1_out1'))

             .conv(1, 1, 16, 1, 1, name='icp1_reduction2')
             .conv(5, 5, 32, 1, 1, name='icp1_out2'))

             .max_pool(3, 3, 1, 1, name='icp1_pool')
             .conv(1, 1, 32, 1, 1, name='icp1_out3'))

             .conv(1, 1, 64, 1, 1, name='icp1_out0'))

             .concat(3, name='icp2_in')
             .conv(1, 1, 128, 1, 1, name='icp2_reduction1')
             .conv(3, 3, 192, 1, 1, name='icp2_out1'))

             .conv(1, 1, 32, 1, 1, name='icp2_reduction2')
             .conv(5, 5, 96, 1, 1, name='icp2_out2'))

             .max_pool(3, 3, 1, 1, name='icp2_pool')
             .conv(1, 1, 64, 1, 1, name='icp2_out3'))

             .conv(1, 1, 128, 1, 1, name='icp2_out0'))

             .concat(3, name='icp2_out')
             .max_pool(3, 3, 2, 2, name='icp3_in')
             .conv(1, 1, 96, 1, 1, name='icp3_reduction1')
             .conv(3, 3, 208, 1, 1, name='icp3_out1'))

             .conv(1, 1, 16, 1, 1, name='icp3_reduction2')
             .conv(5, 5, 48, 1, 1, name='icp3_out2'))

             .max_pool(3, 3, 1, 1, name='icp3_pool')
             .conv(1, 1, 64, 1, 1, name='icp3_out3'))

             .conv(1, 1, 192, 1, 1, name='icp3_out0'))

             .concat(3, name='icp3_out')
             .avg_pool(5, 5, 3, 3, padding='VALID', name='cls1_pool')
             .conv(1, 1, 128, 1, 1, name='cls1_reduction_pose')
             .fc(1024, name='cls1_fc1_pose')
             .fc(3, relu=False, name='cls1_fc_pose_xyz'))

             .fc(4, relu=False, name='cls1_fc_pose_wpqr'))

             .conv(1, 1, 112, 1, 1, name='icp4_reduction1')
             .conv(3, 3, 224, 1, 1, name='icp4_out1'))

             .conv(1, 1, 24, 1, 1, name='icp4_reduction2')
             .conv(5, 5, 64, 1, 1, name='icp4_out2'))

             .max_pool(3, 3, 1, 1, name='icp4_pool')
             .conv(1, 1, 64, 1, 1, name='icp4_out3'))

             .conv(1, 1, 160, 1, 1, name='icp4_out0'))

             .concat(3, name='icp4_out')
             .conv(1, 1, 128, 1, 1, name='icp5_reduction1')
             .conv(3, 3, 256, 1, 1, name='icp5_out1'))

             .conv(1, 1, 24, 1, 1, name='icp5_reduction2')
             .conv(5, 5, 64, 1, 1, name='icp5_out2'))

             .max_pool(3, 3, 1, 1, name='icp5_pool')
             .conv(1, 1, 64, 1, 1, name='icp5_out3'))

             .conv(1, 1, 128, 1, 1, name='icp5_out0'))

             .concat(3, name='icp5_out')
             .conv(1, 1, 144, 1, 1, name='icp6_reduction1')
             .conv(3, 3, 288, 1, 1, name='icp6_out1'))

             .conv(1, 1, 32, 1, 1, name='icp6_reduction2')
             .conv(5, 5, 64, 1, 1, name='icp6_out2'))

             .max_pool(3, 3, 1, 1, name='icp6_pool')
             .conv(1, 1, 64, 1, 1, name='icp6_out3'))

             .conv(1, 1, 112, 1, 1, name='icp6_out0'))

             .concat(3, name='icp6_out')
             .avg_pool(5, 5, 3, 3, padding='VALID', name='cls2_pool')
             .conv(1, 1, 128, 1, 1, name='cls2_reduction_pose')
             .fc(1024, name='cls2_fc1')
             .fc(3, relu=False, name='cls2_fc_pose_xyz'))

             .fc(4, relu=False, name='cls2_fc_pose_wpqr'))

             .conv(1, 1, 160, 1, 1, name='icp7_reduction1')
             .conv(3, 3, 320, 1, 1, name='icp7_out1'))

             .conv(1, 1, 32, 1, 1, name='icp7_reduction2')
             .conv(5, 5, 128, 1, 1, name='icp7_out2'))

             .max_pool(3, 3, 1, 1, name='icp7_pool')
             .conv(1, 1, 128, 1, 1, name='icp7_out3'))

             .conv(1, 1, 256, 1, 1, name='icp7_out0'))

             .concat(3, name='icp7_out')
             .max_pool(3, 3, 2, 2, name='icp8_in')
             .conv(1, 1, 160, 1, 1, name='icp8_reduction1')
             .conv(3, 3, 320, 1, 1, name='icp8_out1'))

             .conv(1, 1, 32, 1, 1, name='icp8_reduction2')
             .conv(5, 5, 128, 1, 1, name='icp8_out2'))

             .max_pool(3, 3, 1, 1, name='icp8_pool')
             .conv(1, 1, 128, 1, 1, name='icp8_out3'))

             .conv(1, 1, 256, 1, 1, name='icp8_out0'))

             .concat(3, name='icp8_out')
             .conv(1, 1, 192, 1, 1, name='icp9_reduction1')
             .conv(3, 3, 384, 1, 1, name='icp9_out1'))

             .conv(1, 1, 48, 1, 1, name='icp9_reduction2')
             .conv(5, 5, 128, 1, 1, name='icp9_out2'))

             .max_pool(3, 3, 1, 1, name='icp9_pool')
             .conv(1, 1, 128, 1, 1, name='icp9_out3'))

             .conv(1, 1, 384, 1, 1, name='icp9_out0'))

             .concat(3, name='icp9_out')
             .avg_pool(7, 7, 1, 1, padding='VALID', name='cls3_pool')
             .fc(2048, name='cls3_fc1_pose')
             .fc(3, relu=False, name='cls3_fc_pose_xyz'))

             .fc(4, relu=False, name='cls3_fc_pose_wpqr'))


  • PoseNet的输入图像是224*224分辨率的,加上本任务对图像尺寸视角等敏感,不适合直接放缩,所以采用中心裁剪的办法,中心裁剪函数如下:
def centeredCrop(img, output_side_length):
    """Cut a square window of ``output_side_length`` pixels out of ``img``.

    Args:
        img: Image array whose first two axes are (height, width).
        output_side_length: Side length of the square crop, in pixels.

    Returns:
        The slice ``img[top:top + side, left:left + side]``.

    The offsets are derived from the size the long side would have if the
    short side were scaled to ``output_side_length``.
    NOTE(review): for an image that has not been rescaled that way the window
    is not centred on the image -- confirm this is the intended geometry.
    """
    height, width = img.shape[:2]
    new_height = output_side_length
    new_width = output_side_length
    if height > width:
        new_height = output_side_length * height // width
    else:
        new_width = output_side_length * width // height
    # Floor division keeps the offsets integral; float offsets are rejected
    # as slice indices.
    height_offset = (new_height - output_side_length) // 2
    width_offset = (new_width - output_side_length) // 2
    return img[height_offset:height_offset + output_side_length,
               width_offset:width_offset + output_side_length]
  • 预处理函数,在这个函数中会调用上面的中心裁剪函数,并对每张图像减去所有图像的逐像素均值(去均值),中间会临时转换维度顺序,最终输出仍为 高×宽×通道 的排列,方便送入TensorFlow的网络:
def preprocess(images):
    """Load, crop and mean-centre a list of image files.

    Args:
        images: Sequence of image file paths.

    Returns:
        List of float arrays of shape (224, 224, 3): each cropped image minus
        the per-pixel mean over all cropped images.  Empty input gives ``[]``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read one of the paths.
    """
    side = 224
    # Crop every image first; the mean can only be computed once all crops
    # are available.
    images_cropped = []
    for path in tqdm(images):
        X = cv2.imread(path)
        if X is None:
            raise FileNotFoundError('cannot read image: %s' % path)
        #X = cv2.resize(X, (455, 256))
        images_cropped.append(centeredCrop(X, side))
    if not images_cropped:
        return []

    # Per-pixel, per-channel mean over the whole set.
    mean = np.zeros((side, side, 3))
    for X in tqdm(images_cropped):
        mean += X
    mean /= len(images_cropped)

    # Subtract the mean from every image.
    return [X - mean for X in tqdm(images_cropped)]
  • 如果调试的时候,为了快速验证,可以不处理全部图像,只处理前几张,比如把循环次数 len(images) 改成 len(images)*0+2(即只处理前2张)


def centeredCrop(img, output_side_length):
    """Cut a square window of ``output_side_length`` pixels out of ``img``.

    Args:
        img: Image array whose first two axes are (height, width).
        output_side_length: Side length of the square crop, in pixels.

    Returns:
        The slice ``img[top:top + side, left:left + side]``.

    The offsets are derived from the size the long side would have if the
    short side were scaled to ``output_side_length``.
    NOTE(review): for an image that has not been rescaled that way the window
    is not centred on the image -- confirm this is the intended geometry.
    """
    height, width = img.shape[:2]
    new_height = output_side_length
    new_width = output_side_length
    if height > width:
        new_height = output_side_length * height // width
    else:
        new_width = output_side_length * width // height
    # Floor division keeps the offsets integral; float offsets are rejected
    # as slice indices.
    height_offset = (new_height - output_side_length) // 2
    width_offset = (new_width - output_side_length) // 2
    return img[height_offset:height_offset + output_side_length,
               width_offset:width_offset + output_side_length]
def preprocess(images):
    """Debug variant of the preprocessing: only the first two images are used.

    Args:
        images: Sequence of image file paths.

    Returns:
        List of float arrays of shape (224, 224, 3): each cropped image minus
        the per-pixel mean over the processed images.  Empty input gives
        ``[]``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read one of the paths.
    """
    side = 224
    # Quick-check mode: ``len(images)*0+2`` always evaluates to 2, so only
    # the first two paths are loaded (fewer if the list is shorter).
    debug_limit = 2
    images_cropped = []
    for path in tqdm(images[:debug_limit]):
        X = cv2.imread(path)
        if X is None:
            raise FileNotFoundError('cannot read image: %s' % path)
        #X = cv2.resize(X, (455, 256))
        images_cropped.append(centeredCrop(X, side))
    if not images_cropped:
        return []

    # Per-pixel, per-channel mean over the processed images.
    mean = np.zeros((side, side, 3))
    for X in tqdm(images_cropped):
        mean += X
    mean /= len(images_cropped)

    # Subtract the mean from every image.
    return [X - mean for X in tqdm(images_cropped)]


  • 基本配置设置(my_train_labels是之前文件处理写入的路径)
# Number of samples per training batch; the input placeholders are built
# with this fixed batch dimension.
batch_size = 75
# Number of optimisation steps run by the training loop.
max_iterations = 30000
# Set this path to your dataset directory
# (CSV with the extra rotation_matrix_quaternion column).
my_train_labels = '/kaggle/working/my_train_labels.csv'
  • 创建datasource类,成对存储数据
class datasource(object):
    """Plain container that keeps images and their poses side by side."""

    def __init__(self, images, poses):
        # Entry i of ``images`` belongs to entry i of ``poses``.
        self.images, self.poses = images, poses
  • 获取单一数据
def gen_data(source):
    """Yield ``(image, pose_x, pose_q)`` samples from ``source`` forever.

    Args:
        source: Object with parallel sequences ``images`` and ``poses``; each
            pose holds the position in entries 0..2 and the quaternion in
            entries 3..6.

    Yields:
        Tuples ``(image, pose[0:3], pose[3:7])``.  Every pass over the data
        visits each sample once, in a freshly shuffled order.

    Raises:
        ValueError: If ``source`` contains no images (the loop would
            otherwise spin forever without yielding).
    """
    import random

    while True:
        indices = list(range(len(source.images)))
        if not indices:
            raise ValueError('gen_data: source contains no images')
        # New random order for every pass so consecutive batches are not
        # always built from neighbouring rows of the label file.
        random.shuffle(indices)
        for i in indices:
            pose = source.poses[i]
            yield source.images[i], pose[0:3], pose[3:7]
  • 批量获取数据
def gen_data_batch(source):
    """Yield batches of ``batch_size`` samples drawn from ``gen_data(source)``.

    Args:
        source: Data container accepted by ``gen_data``.

    Yields:
        Tuples ``(images, poses_x, poses_q)`` of ``numpy`` arrays, each with
        ``batch_size`` entries along the first axis.
    """
    data_gen = gen_data(source)
    while True:
        image_batch = []
        pose_x_batch = []
        pose_q_batch = []
        for _ in range(batch_size):
            image, pose_x, pose_q = next(data_gen)
            # Collect the sample; without these appends every batch is empty.
            image_batch.append(image)
            pose_x_batch.append(pose_x)
            pose_q_batch.append(pose_q)
        yield np.array(image_batch), np.array(pose_x_batch), np.array(pose_q_batch)
  • 获取数据的最终函数:(中间的路径需要根据训练图像所处位置进行更改,按照任务背景的目录结构,是train_labels.csv所处文件夹)
def get_data():
    """Read the augmented label file and build the training data source.

    Reads ``my_train_labels`` and, for every row, parses the ';'-separated
    ``translation_vector`` and the parenthesised, ','-separated
    ``rotation_matrix_quaternion`` into one 7-tuple
    ``(x, y, z, q0, q1, q2, q3)``.  Image paths are resolved relative to the
    competition's train directory and passed through ``preprocess``.

    Returns:
        ``datasource(images, poses)``.

    Raises:
        ValueError: If a row does not hold 3 position and 4 quaternion values.
    """
    poses = []
    images = []
    # Columns are addressed by name so the result does not depend on whether
    # the CSV carries an extra index column.
    for row in pd.read_csv(my_train_labels).itertuples():
        position = [float(v) for v in row.translation_vector.split(';')]
        quaternion = [
            float(v)
            for v in row.rotation_matrix_quaternion.strip('()').split(',')
        ]
        if len(position) != 3 or len(quaternion) != 4:
            raise ValueError('malformed pose for image %s' % row.image_path)
        poses.append(tuple(position + quaternion))
        images.append('/kaggle/input/image-matching-challenge-2023/train/' + row.image_path)
    images = preprocess(images)
    return datasource(images, poses)


# Graph inputs: a batch of 224x224 3-channel images plus the target position
# (3 values) and orientation quaternion (4 values) for every image.
images = tf.placeholder(tf.float32, [batch_size, 224, 224, 3])
poses_x = tf.placeholder(tf.float32, [batch_size, 3])
poses_q = tf.placeholder(tf.float32, [batch_size, 4])
# NOTE(review): this rebinds the name `datasource` from the class to an
# instance, so get_data() cannot be called a second time in the same
# interpreter session.
datasource = get_data()

# Build the network on top of the image placeholder.
net = GoogLeNet({'data': images})

# Position (xyz) and orientation (wpqr) outputs of the three regression heads.
p1_x = net.layers['cls1_fc_pose_xyz']
p1_q = net.layers['cls1_fc_pose_wpqr']
p2_x = net.layers['cls2_fc_pose_xyz']
p2_q = net.layers['cls2_fc_pose_wpqr']
p3_x = net.layers['cls3_fc_pose_xyz']
p3_q = net.layers['cls3_fc_pose_wpqr']

# Each term is the L2 norm of the error summed over the whole batch, scaled
# by a fixed weight; heads 1 and 2 are weighted lower than head 3.
l1_x = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p1_x, poses_x)))) * 0.3
l1_q = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p1_q, poses_q)))) * 150
l2_x = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p2_x, poses_x)))) * 0.3
l2_q = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p2_q, poses_q)))) * 150
l3_x = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p3_x, poses_x)))) * 1
l3_q = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p3_q, poses_q)))) * 500

# Total loss is the plain sum of all six terms.
loss = l1_x + l1_q + l2_x + l2_q + l3_x + l3_q
opt = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=0.00000001, use_locking=False, name='Adam').minimize(loss)

# Set GPU options
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6833)

# Created after the optimizer so the initializer also covers the variables
# the optimizer added to the graph.
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# Checkpoint path used by the training loop.
outputFile = "PoseNet.ckpt"
  • 这里运行第二遍会报错,因为网络已经在内存中的默认计算图里构建过一次(同名变量已存在);如需重新运行,可先重启内核,或先调用 tf.reset_default_graph() 清空计算图


  • 下面的代码cpu、gpu环境都可以,每20轮打印一下损失,每500轮保存一下权重
  • 加载的预训练权重放在了'/kaggle/input/tensorflow-posenet-master/tensorflow-posenet-master/posenet.npy',是官方caffe权重转换过来的,下载链接
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    # Initialise every variable first.  The pretrained file only assigns the
    # variables it contains; everything else (e.g. the optimizer's own
    # variables) would otherwise be used uninitialised.
    sess.run(init)
    # Load the data
    net.load('/kaggle/input/tensorflow-posenet-master/tensorflow-posenet-master/posenet.npy', sess)

    data_gen = gen_data_batch(datasource)
    for i in range(max_iterations):
        np_images, np_poses_x, np_poses_q = next(data_gen)
        feed = {images: np_images, poses_x: np_poses_x, poses_q: np_poses_q}

        # One optimisation step, then re-evaluate the loss on the same batch
        # (i.e. the reported loss is measured after the update).
        sess.run(opt, feed_dict=feed)
        np_loss = sess.run(loss, feed_dict=feed)
        if i % 20 == 0:
            print("iteration: " + str(i) + "\n\t" + "Loss is: " + str(np_loss))
        if i % 500 == 0:
            # Periodic checkpoint; the same file is overwritten each time.
            saver.save(sess, outputFile)
            print("Intermediate file saved at: " + outputFile)
    # Final checkpoint after the last iteration.
    saver.save(sess, outputFile)
    print("Intermediate file saved at: " + outputFile)
  • 效果如下:
    (此处为训练效果截图)

## 9.完整代码如下
  • 完整notebook下载链接
  • tensorflow权重文件下载地址
  • 完整代码如下:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.compat.v1 as tf
import random
import cv2
from tqdm import tqdm

# Switch for the label-conversion step below: a truthy value makes the
# script read train_labels.csv and add the quaternion column.
change_train_labels = 1
def matrix2quaternion(m):
    """Convert a 3x3 rotation matrix to a quaternion ``(w, x, y, z)``.

    Args:
        m: 3x3 rotation matrix, indexable as ``m[row][col]`` (nested
            sequence or ``numpy`` array).

    Returns:
        Tuple ``(w, x, y, z)`` of plain Python floats with ``w >= 0``.

    The naive formula ``w = sqrt(trace + 1) / 2`` followed by a division by
    ``4 * w`` breaks down for rotations close to 180 degrees (``w`` is zero or
    the square root receives a negative number).  The divisor is therefore
    derived from the largest quaternion component instead.  For ``trace > 0``
    the result is identical to the naive formula.
    """
    m00, m01, m02 = (float(v) for v in m[0])
    m10, m11, m12 = (float(v) for v in m[1])
    m20, m21, m22 = (float(v) for v in m[2])
    trace = m00 + m11 + m22

    if trace > 0:
        s = 2 * (trace + 1) ** 0.5  # s == 4 * w
        w = s / 4
        x = (m21 - m12) / s
        y = (m02 - m20) / s
        z = (m10 - m01) / s
    elif m00 >= m11 and m00 >= m22:
        s = 2 * (1 + m00 - m11 - m22) ** 0.5  # s == 4 * x
        w = (m21 - m12) / s
        x = s / 4
        y = (m01 + m10) / s
        z = (m02 + m20) / s
    elif m11 >= m22:
        s = 2 * (1 + m11 - m00 - m22) ** 0.5  # s == 4 * y
        w = (m02 - m20) / s
        x = (m01 + m10) / s
        y = s / 4
        z = (m12 + m21) / s
    else:
        s = 2 * (1 + m22 - m00 - m11) ** 0.5  # s == 4 * z
        w = (m10 - m01) / s
        x = (m02 + m20) / s
        y = (m12 + m21) / s
        z = s / 4

    # q and -q describe the same rotation; keep w non-negative so every
    # matrix maps to one consistent representative.
    if w < 0:
        w, x, y, z = -w, -x, -y, -z
    return w, x, y, z
def quaternion2matrix(q):
    """Build the 3x3 matrix for the quaternion ``q = (w, x, y, z)``.

    The quaternion is used as given (no normalisation is applied here).
    Returns a ``numpy`` array of shape (3, 3).
    """
    w, x, y, z = q
    # Doubled pairwise products shared by the matrix entries.
    xx, yy, zz = 2*x*x, 2*y*y, 2*z*z
    xy, xz, yz = 2*x*y, 2*x*z, 2*y*z
    wx, wy, wz = 2*x*w, 2*y*w, 2*z*w
    row0 = [1 - yy - zz, xy - wz, xz + wy]
    row1 = [xy + wz, 1 - xx - zz, yz - wx]
    row2 = [xz - wy, yz + wx, 1 - xx - yy]
    return np.array([row0, row1, row2])

def m(a):
    """Parse one ``rotation_matrix`` cell and return its quaternion.

    Args:
        a: String with the nine matrix entries separated by ';', taken in
            row-major order (the first three values form the first row).

    Returns:
        Whatever ``matrix2quaternion`` returns for the parsed 3x3 matrix.

    Raises:
        ValueError: If the string does not contain exactly nine numbers.
    """
    values = [float(v) for v in a.split(';')]
    if len(values) != 9:
        raise ValueError(
            'expected 9 rotation matrix entries, got %d' % len(values))
    A = np.array(values).reshape(3, 3)
    return matrix2quaternion(A)

if change_train_labels:
    train_labels = pd.read_csv('/kaggle/input/image-matching-challenge-2023/train/train_labels.csv')
    # One quaternion per row, derived from the ';'-separated rotation matrix.
    train_labels['rotation_matrix_quaternion'] = [
        m(rotation) for rotation in train_labels['rotation_matrix']
    ]
    # Persist the augmented table; without this the new column only lives in
    # memory.  The pandas default (index column included) is kept on purpose.
    train_labels.to_csv('/kaggle/working/my_train_labels.csv')


def layer(op):
    '''Decorator for composable network layers.

    The wrapped method receives the current terminal node(s) as its input
    (a single node, or a list when several nodes are terminal), its result is
    registered in ``self.layers`` under the layer name, and that result
    becomes the new terminal node so that calls can be chained.
    '''
    from functools import wraps

    @wraps(op)
    def layer_decorated(self, *args, **kwargs):
        # Automatically set a name if not provided.
        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
        # Figure out the layer inputs.
        if len(self.terminals) == 0:
            raise RuntimeError('No input variables found for layer %s.' % name)
        elif len(self.terminals) == 1:
            layer_input = self.terminals[0]
        else:
            layer_input = list(self.terminals)
        # Perform the operation and get the output.
        layer_output = op(self, layer_input, *args, **kwargs)
        # Add to layer LUT.
        self.layers[name] = layer_output
        # This output is now the input for the next layer.
        self.feed(layer_output)
        # Return self for chained calls.
        return self

    return layer_decorated

class Network(object):

    def __init__(self, inputs, trainable=True):
        # The input nodes for this network
        self.inputs = inputs
        # The current list of terminal nodes
        self.terminals = []
        # Mapping from layer names to layers
        self.layers = dict(inputs)
        # If true, the resulting variables are set as trainable
        self.trainable = trainable
        # Switch variable for dropout
        self.use_dropout = tf.placeholder_with_default(tf.constant(1.0),

    def setup(self):
        '''Construct the network. '''
        raise NotImplementedError('Must be implemented by the subclass.')

    def load(self, data_path, session, ignore_missing=False):
        '''Load network weights.
        data_path: The path to the numpy-serialized network weights
        session: The current TensorFlow session
        ignore_missing: If true, serialized weights for missing layers are ignored.
        data_dict = np.load(data_path,allow_pickle=True,encoding="latin1").item()
        for op_name in data_dict:
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in data_dict[op_name].items():
                        var = tf.get_variable(param_name)
                    except ValueError:
                        if not ignore_missing:

    def feed(self, *args):
        '''Set the input(s) for the next operation by replacing the terminal nodes.
        The arguments can be either layer names or the actual layers.
        assert len(args) != 0
        self.terminals = []
        for fed_layer in args:
            if isinstance(fed_layer, str):
                    fed_layer = self.layers[fed_layer]
                except KeyError:
                    raise KeyError('Unknown layer name fed: %s' % fed_layer)
        return self

    def get_output(self):
        '''Returns the current network output.'''
        return self.terminals[-1]

    def get_unique_name(self, prefix):
        '''Returns an index-suffixed unique name for the given prefix.
        This is used for auto-generating layer names based on the type-prefix.
        ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
        return '%s_%d' % (prefix, ident)

    def make_var(self, name, shape):
        '''Creates a new TensorFlow variable.'''
        return tf.get_variable(name, shape, trainable=self.trainable)

    def validate_padding(self, padding):
        '''Verifies that the padding is one of the supported ones.'''
        assert padding in ('SAME', 'VALID')

    def conv(self,
        # Verify that the padding is acceptable
        # Get the number of channels in the input
        c_i = input.get_shape()[-1]
        # Verify that the grouping parameter is valid
        assert c_i % group == 0
        assert c_o % group == 0
        # Convolution for a given input and kernel
        convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
        with tf.variable_scope(name) as scope:
            kernel = self.make_var('weights', shape=[k_h, k_w, int(int(c_i) / group), c_o])
            if group == 1:
                # This is the common-case. Convolve the input without any further complications.
                output = convolve(input, kernel)
                # Split the input into groups and then convolve each of them independently
                input_groups = tf.split(3, group, input)
                kernel_groups = tf.split(3, group, kernel)
                output_groups = [convolve(i, k) for i, k in zip(input_groups, kernel_groups)]
                # Concatenate the groups
                output = tf.concat(3, output_groups)
            # Add the biases
            if biased:
                biases = self.make_var('biases', [c_o])
                output = tf.nn.bias_add(output, biases)
            if relu:
                # ReLU non-linearity
                output = tf.nn.relu(output, name=scope.name)
            return output

    def relu(self, input, name):
        return tf.nn.relu(input, name=name)

    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
        return tf.nn.max_pool(input,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],

    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
        return tf.nn.avg_pool(input,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],

    def lrn(self, input, radius, alpha, beta, name, bias=1.0):
        return tf.nn.local_response_normalization(input,

    def concat(self, inputs, axis, name):
        return tf.concat(values=inputs, axis=axis, name=name)

    def add(self, inputs, name):
        return tf.add_n(inputs, name=name)

    def fc(self, input, num_out, name, relu=True):
        with tf.variable_scope(name) as scope:
            input_shape = input.get_shape()
            if input_shape.ndims == 4:
                # The input is spatial. Vectorize it first.
                dim = 1
                for d in input_shape[1:].as_list():
                    dim *= d
                feed_in = tf.reshape(input, [-1, dim])
                feed_in, dim = (input, input_shape[-1].value)
            weights = self.make_var('weights', shape=[dim, num_out])
            biases = self.make_var('biases', [num_out])
            op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b
            fc = op(feed_in, weights, biases, name=scope.name)
            return fc

    def softmax(self, input, name):
        input_shape = map(lambda v: v.value, input.get_shape())
        if len(input_shape) > 2:
            # For certain models (like NiN), the singleton spatial dimensions
            # need to be explicitly squeezed, since they're not broadcast-able
            # in TensorFlow's NHWC ordering (unlike Caffe's NCHW).
            if input_shape[1] == 1 and input_shape[2] == 1:
                input = tf.squeeze(input, squeeze_dims=[1, 2])
                raise ValueError('Rank 2 tensor input expected for softmax!')
        return tf.nn.softmax(input, name)

    def batch_normalization(self, input, name, scale_offset=True, relu=False):
        # NOTE: Currently, only inference is supported
        with tf.variable_scope(name) as scope:
            shape = [input.get_shape()[-1]]
            if scale_offset:
                scale = self.make_var('scale', shape=shape)
                offset = self.make_var('offset', shape=shape)
                scale, offset = (None, None)
            output = tf.nn.batch_normalization(
                mean=self.make_var('mean', shape=shape),
                variance=self.make_var('variance', shape=shape),
                # TODO: This is the default Caffe batch norm eps
                # Get the actual eps from parameters
            if relu:
                output = tf.nn.relu(output)
            return output

    def dropout(self, input, keep_prob, name):
        keep = 1 - self.use_dropout + (self.use_dropout * keep_prob)
        return tf.nn.dropout(input, keep, name=name)

def centeredCrop(img, output_side_length):
    """Cut a square window of ``output_side_length`` pixels out of ``img``.

    Args:
        img: Image array whose first two axes are (height, width).
        output_side_length: Side length of the square crop, in pixels.

    Returns:
        The slice ``img[top:top + side, left:left + side]``.

    The offsets are derived from the size the long side would have if the
    short side were scaled to ``output_side_length``.
    NOTE(review): for an image that has not been rescaled that way the window
    is not centred on the image -- confirm this is the intended geometry.
    """
    height, width = img.shape[:2]
    new_height = output_side_length
    new_width = output_side_length
    if height > width:
        new_height = output_side_length * height // width
    else:
        new_width = output_side_length * width // height
    # Floor division keeps the offsets integral; float offsets are rejected
    # as slice indices.
    height_offset = (new_height - output_side_length) // 2
    width_offset = (new_width - output_side_length) // 2
    return img[height_offset:height_offset + output_side_length,
               width_offset:width_offset + output_side_length]
def preprocess(images):
    """Load, crop and mean-centre a list of image files.

    Args:
        images: Sequence of image file paths.

    Returns:
        List of float arrays of shape (224, 224, 3): each cropped image minus
        the per-pixel mean over all cropped images.  Empty input gives ``[]``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read one of the paths.
    """
    side = 224
    # Crop every image first; the mean can only be computed once all crops
    # are available.
    images_cropped = []
    for path in tqdm(images):
        X = cv2.imread(path)
        if X is None:
            raise FileNotFoundError('cannot read image: %s' % path)
        #X = cv2.resize(X, (455, 256))
        images_cropped.append(centeredCrop(X, side))
    if not images_cropped:
        return []

    # Per-pixel, per-channel mean over the whole set.
    mean = np.zeros((side, side, 3))
    for X in tqdm(images_cropped):
        mean += X
    mean /= len(images_cropped)

    # Subtract the mean from every image.
    return [X - mean for X in tqdm(images_cropped)]

class GoogLeNet(Network):
    '''GoogLeNet-style graph with three pose regression heads.

    Each head (`cls1`, `cls2`, `cls3`) produces a 3-value `*_fc_pose_xyz`
    layer and a 4-value `*_fc_pose_wpqr` layer.
    '''

    def setup(self):
        '''Declare all layers of the network, starting from the 'data' input.'''
        # NOTE(review): the opening `self.feed(...)` of every chain was missing
        # and has been reconstructed from the layer names (each inception
        # block `icpN` reads the previous block's output and concatenates its
        # out0..out3 branches). Confirm against Network.feed and the
        # pretrained weight file.
        (self.feed('data')
             .conv(7, 7, 64, 2, 2, name='conv1')
             .max_pool(3, 3, 2, 2, name='pool1')
             .lrn(2, 2e-05, 0.75, name='norm1')
             .conv(1, 1, 64, 1, 1, name='reduction2')
             .conv(3, 3, 192, 1, 1, name='conv2')
             .lrn(2, 2e-05, 0.75, name='norm2')
             .max_pool(3, 3, 2, 2, name='pool2')
             .conv(1, 1, 96, 1, 1, name='icp1_reduction1')
             .conv(3, 3, 128, 1, 1, name='icp1_out1'))

        (self.feed('pool2')
             .conv(1, 1, 16, 1, 1, name='icp1_reduction2')
             .conv(5, 5, 32, 1, 1, name='icp1_out2'))

        (self.feed('pool2')
             .max_pool(3, 3, 1, 1, name='icp1_pool')
             .conv(1, 1, 32, 1, 1, name='icp1_out3'))

        (self.feed('pool2')
             .conv(1, 1, 64, 1, 1, name='icp1_out0'))

        (self.feed('icp1_out0', 'icp1_out1', 'icp1_out2', 'icp1_out3')
             .concat(3, name='icp2_in')
             .conv(1, 1, 128, 1, 1, name='icp2_reduction1')
             .conv(3, 3, 192, 1, 1, name='icp2_out1'))

        (self.feed('icp2_in')
             .conv(1, 1, 32, 1, 1, name='icp2_reduction2')
             .conv(5, 5, 96, 1, 1, name='icp2_out2'))

        (self.feed('icp2_in')
             .max_pool(3, 3, 1, 1, name='icp2_pool')
             .conv(1, 1, 64, 1, 1, name='icp2_out3'))

        (self.feed('icp2_in')
             .conv(1, 1, 128, 1, 1, name='icp2_out0'))

        (self.feed('icp2_out0', 'icp2_out1', 'icp2_out2', 'icp2_out3')
             .concat(3, name='icp2_out')
             .max_pool(3, 3, 2, 2, name='icp3_in')
             .conv(1, 1, 96, 1, 1, name='icp3_reduction1')
             .conv(3, 3, 208, 1, 1, name='icp3_out1'))

        (self.feed('icp3_in')
             .conv(1, 1, 16, 1, 1, name='icp3_reduction2')
             .conv(5, 5, 48, 1, 1, name='icp3_out2'))

        (self.feed('icp3_in')
             .max_pool(3, 3, 1, 1, name='icp3_pool')
             .conv(1, 1, 64, 1, 1, name='icp3_out3'))

        (self.feed('icp3_in')
             .conv(1, 1, 192, 1, 1, name='icp3_out0'))

        # First auxiliary pose head.
        (self.feed('icp3_out0', 'icp3_out1', 'icp3_out2', 'icp3_out3')
             .concat(3, name='icp3_out')
             .avg_pool(5, 5, 3, 3, padding='VALID', name='cls1_pool')
             .conv(1, 1, 128, 1, 1, name='cls1_reduction_pose')
             .fc(1024, name='cls1_fc1_pose')
             .fc(3, relu=False, name='cls1_fc_pose_xyz'))

        (self.feed('cls1_fc1_pose')
             .fc(4, relu=False, name='cls1_fc_pose_wpqr'))

        (self.feed('icp3_out')
             .conv(1, 1, 112, 1, 1, name='icp4_reduction1')
             .conv(3, 3, 224, 1, 1, name='icp4_out1'))

        (self.feed('icp3_out')
             .conv(1, 1, 24, 1, 1, name='icp4_reduction2')
             .conv(5, 5, 64, 1, 1, name='icp4_out2'))

        (self.feed('icp3_out')
             .max_pool(3, 3, 1, 1, name='icp4_pool')
             .conv(1, 1, 64, 1, 1, name='icp4_out3'))

        (self.feed('icp3_out')
             .conv(1, 1, 160, 1, 1, name='icp4_out0'))

        (self.feed('icp4_out0', 'icp4_out1', 'icp4_out2', 'icp4_out3')
             .concat(3, name='icp4_out')
             .conv(1, 1, 128, 1, 1, name='icp5_reduction1')
             .conv(3, 3, 256, 1, 1, name='icp5_out1'))

        (self.feed('icp4_out')
             .conv(1, 1, 24, 1, 1, name='icp5_reduction2')
             .conv(5, 5, 64, 1, 1, name='icp5_out2'))

        (self.feed('icp4_out')
             .max_pool(3, 3, 1, 1, name='icp5_pool')
             .conv(1, 1, 64, 1, 1, name='icp5_out3'))

        (self.feed('icp4_out')
             .conv(1, 1, 128, 1, 1, name='icp5_out0'))

        (self.feed('icp5_out0', 'icp5_out1', 'icp5_out2', 'icp5_out3')
             .concat(3, name='icp5_out')
             .conv(1, 1, 144, 1, 1, name='icp6_reduction1')
             .conv(3, 3, 288, 1, 1, name='icp6_out1'))

        (self.feed('icp5_out')
             .conv(1, 1, 32, 1, 1, name='icp6_reduction2')
             .conv(5, 5, 64, 1, 1, name='icp6_out2'))

        (self.feed('icp5_out')
             .max_pool(3, 3, 1, 1, name='icp6_pool')
             .conv(1, 1, 64, 1, 1, name='icp6_out3'))

        (self.feed('icp5_out')
             .conv(1, 1, 112, 1, 1, name='icp6_out0'))

        # Second auxiliary pose head.
        (self.feed('icp6_out0', 'icp6_out1', 'icp6_out2', 'icp6_out3')
             .concat(3, name='icp6_out')
             .avg_pool(5, 5, 3, 3, padding='VALID', name='cls2_pool')
             .conv(1, 1, 128, 1, 1, name='cls2_reduction_pose')
             .fc(1024, name='cls2_fc1')
             .fc(3, relu=False, name='cls2_fc_pose_xyz'))

        (self.feed('cls2_fc1')
             .fc(4, relu=False, name='cls2_fc_pose_wpqr'))

        (self.feed('icp6_out')
             .conv(1, 1, 160, 1, 1, name='icp7_reduction1')
             .conv(3, 3, 320, 1, 1, name='icp7_out1'))

        (self.feed('icp6_out')
             .conv(1, 1, 32, 1, 1, name='icp7_reduction2')
             .conv(5, 5, 128, 1, 1, name='icp7_out2'))

        (self.feed('icp6_out')
             .max_pool(3, 3, 1, 1, name='icp7_pool')
             .conv(1, 1, 128, 1, 1, name='icp7_out3'))

        (self.feed('icp6_out')
             .conv(1, 1, 256, 1, 1, name='icp7_out0'))

        (self.feed('icp7_out0', 'icp7_out1', 'icp7_out2', 'icp7_out3')
             .concat(3, name='icp7_out')
             .max_pool(3, 3, 2, 2, name='icp8_in')
             .conv(1, 1, 160, 1, 1, name='icp8_reduction1')
             .conv(3, 3, 320, 1, 1, name='icp8_out1'))

        (self.feed('icp8_in')
             .conv(1, 1, 32, 1, 1, name='icp8_reduction2')
             .conv(5, 5, 128, 1, 1, name='icp8_out2'))

        (self.feed('icp8_in')
             .max_pool(3, 3, 1, 1, name='icp8_pool')
             .conv(1, 1, 128, 1, 1, name='icp8_out3'))

        (self.feed('icp8_in')
             .conv(1, 1, 256, 1, 1, name='icp8_out0'))

        (self.feed('icp8_out0', 'icp8_out1', 'icp8_out2', 'icp8_out3')
             .concat(3, name='icp8_out')
             .conv(1, 1, 192, 1, 1, name='icp9_reduction1')
             .conv(3, 3, 384, 1, 1, name='icp9_out1'))

        (self.feed('icp8_out')
             .conv(1, 1, 48, 1, 1, name='icp9_reduction2')
             .conv(5, 5, 128, 1, 1, name='icp9_out2'))

        (self.feed('icp8_out')
             .max_pool(3, 3, 1, 1, name='icp9_pool')
             .conv(1, 1, 128, 1, 1, name='icp9_out3'))

        (self.feed('icp8_out')
             .conv(1, 1, 384, 1, 1, name='icp9_out0'))

        # Final pose head.
        (self.feed('icp9_out0', 'icp9_out1', 'icp9_out2', 'icp9_out3')
             .concat(3, name='icp9_out')
             .avg_pool(7, 7, 1, 1, padding='VALID', name='cls3_pool')
             .fc(2048, name='cls3_fc1_pose')
             .fc(3, relu=False, name='cls3_fc_pose_xyz'))

        (self.feed('cls3_fc1_pose')
             .fc(4, relu=False, name='cls3_fc_pose_wpqr'))

# Number of samples per training batch; also the leading dimension of the
# input placeholders and the batch length used by gen_data_batch().
batch_size = 75
# Number of optimisation steps executed by the training loop.
max_iterations = 30000
# Set this path to your dataset directory
# NOTE(review): `directory` and `dataset` are not referenced in the visible
# part of this script; confirm they are still needed before removing them.
directory = 'path_to_datasets/KingsCollege/'
dataset = 'dataset_train.txt'
# CSV file with the training labels read by get_data().
my_train_labels = '/kaggle/working/my_train_labels.csv'

class datasource(object):
    """Plain container pairing the training images with their poses."""

    def __init__(self, images, poses):
        # Both collections are stored as given; nothing is copied or checked.
        self.images, self.poses = images, poses

def centeredCrop(img, output_side_length):
    """Cut an `output_side_length` x `output_side_length` window out of `img`.

    Args:
        img: array indexed as [row, column, ...]; only the first two
            dimensions are inspected.
        output_side_length: side of the square window, in pixels.

    Returns:
        The slice `img[top:top + side, left:left + side]`.

    The offset along the shorter side is 0; along the longer side it is half
    of the excess of the aspect-scaled length over `output_side_length`.
    """
    # NOTE(review): the offsets are derived from the aspect ratio, not from the
    # real pixel size, so the window is only truly centred when the shorter
    # side already equals `output_side_length` (e.g. after a prior resize).
    height, width = img.shape[:2]
    new_height = output_side_length
    new_width = output_side_length
    if height > width:
        new_height = output_side_length * height // width
    else:
        # Only the longer side is rescaled; without this branch a tall image
        # would get a negative width offset and an empty crop.
        new_width = output_side_length * width // height
    # Integer division keeps the offsets usable as slice bounds.
    height_offset = (new_height - output_side_length) // 2
    width_offset = (new_width - output_side_length) // 2
    cropped_img = img[height_offset:height_offset + output_side_length,
                      width_offset:width_offset + output_side_length]
    return cropped_img

def gen_data(source):
    """Yield training samples from `source` forever, one at a time.

    Args:
        source: object exposing `images` and `poses`, indexable in parallel;
            each pose must hold at least 7 values.

    Yields:
        `(image, pose_x, pose_q)` where `pose_x` is `pose[0:3]` and `pose_q`
        is `pose[3:7]`. After the last sample the iteration restarts from the
        first one.

    Raises:
        ValueError: if `source.images` is empty (the loop would otherwise
            spin forever without yielding anything).
    """
    while True:
        if len(source.images) == 0:
            raise ValueError('gen_data: source contains no images')
        # Poses are looked up by image index so a length mismatch surfaces as
        # an IndexError instead of being silently truncated.
        for idx, image in enumerate(source.images):
            pose = source.poses[idx]
            yield image, pose[0:3], pose[3:7]

def gen_data_batch(source):
    """Yield batches of `batch_size` samples drawn from `gen_data(source)`.

    Args:
        source: object accepted by `gen_data`.

    Yields:
        `(images, poses_x, poses_q)` as three numpy arrays, each stacking
        `batch_size` consecutive samples.
    """
    data_gen = gen_data(source)
    while True:
        image_batch = []
        pose_x_batch = []
        pose_q_batch = []
        for _ in range(batch_size):
            image, pose_x, pose_q = next(data_gen)
            # Collect the sample; without these appends every batch is empty.
            image_batch.append(image)
            pose_x_batch.append(pose_x)
            pose_q_batch.append(pose_q)
        yield np.array(image_batch), np.array(pose_x_batch), np.array(pose_q_batch)

def get_data():
    """Read the label CSV and build the training `datasource`.

    For every row of `my_train_labels` the translation vector (three values
    separated by ';') and the quaternion (four comma-separated values inside
    parentheses) are parsed into one 7-value pose, and the image path is
    resolved under the competition's train directory. All images are then run
    through `preprocess`.

    Returns:
        A `datasource` holding the preprocessed images and the poses as
        `(x, y, z, w, p, q, r)` tuples, in CSV row order.

    Raises:
        ValueError: if a row does not contain exactly 3 translation values
            and 4 quaternion values, or a value is not a number.
    """
    poses = []
    images = []
    # Columns are addressed by name so the result does not depend on whether
    # the CSV was written with an extra index column.
    for row in pd.read_csv(my_train_labels).itertuples():
        position = [float(v) for v in row.translation_vector.split(';')]
        # The quaternion is stored as the text of a tuple: "(w, x, y, z)".
        quat_text = row.rotation_matrix_quaternion.split('(')[1].split(')')[0]
        orientation = [float(v) for v in quat_text.split(',')]
        if len(position) != 3 or len(orientation) != 4:
            raise ValueError('Malformed pose for image %s' % row.image_path)
        # gen_data() slices [0:3] as position and [3:7] as orientation.
        poses.append(tuple(position + orientation))
        images.append('/kaggle/input/image-matching-challenge-2023/train/' + row.image_path)
    images = preprocess(images)
    return datasource(images, poses)

# Graph inputs: a batch of 224x224 3-channel images and the matching target
# positions (3 values) and orientations (4 values).
images = tf.placeholder(tf.float32, [batch_size, 224, 224, 3])
poses_x = tf.placeholder(tf.float32, [batch_size, 3])
poses_q = tf.placeholder(tf.float32, [batch_size, 4])
# NOTE: this rebinds the module-level name `datasource` from the class to an
# instance, so get_data() cannot be called a second time afterwards.
datasource = get_data()

net = GoogLeNet({'data': images})

# Position / orientation outputs of the two auxiliary heads and the final head.
p1_x = net.layers['cls1_fc_pose_xyz']
p1_q = net.layers['cls1_fc_pose_wpqr']
p2_x = net.layers['cls2_fc_pose_xyz']
p2_q = net.layers['cls2_fc_pose_wpqr']
p3_x = net.layers['cls3_fc_pose_xyz']
p3_q = net.layers['cls3_fc_pose_wpqr']

# Each term is the Euclidean norm of the error over the whole batch tensor,
# scaled by a fixed weight (0.3 / 150 for the auxiliary heads, 1 / 500 for the
# final head).
l1_x = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p1_x, poses_x)))) * 0.3
l1_q = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p1_q, poses_q)))) * 150
l2_x = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p2_x, poses_x)))) * 0.3
l2_q = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p2_q, poses_q)))) * 150
l3_x = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p3_x, poses_x)))) * 1
l3_q = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(p3_q, poses_q)))) * 500

loss = l1_x + l1_q + l2_x + l2_q + l3_x + l3_q
opt = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=0.00000001, use_locking=False, name='Adam').minimize(loss)

# Set GPU options
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6833)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
outputFile = "PoseNet.ckpt"

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    # Initialise every variable first (this includes the optimizer's own
    # variables), then overwrite the network weights with the pretrained ones.
    # Running `init` after the load would discard the loaded weights.
    sess.run(init)
    # Load the data
    net.load('/kaggle/input/tensorflow-posenet-master/tensorflow-posenet-master/posenet.npy', sess)

    data_gen = gen_data_batch(datasource)
    for i in range(max_iterations):
        np_images, np_poses_x, np_poses_q = next(data_gen)
        feed = {images: np_images, poses_x: np_poses_x, poses_q: np_poses_q}

        sess.run(opt, feed_dict=feed)
        if i % 20 == 0:
            # The loss is only needed for reporting, so the extra forward pass
            # is done only on the iterations that print it.
            np_loss = sess.run(loss, feed_dict=feed)
            print("iteration: " + str(i) + "\n\t" + "Loss is: " + str(np_loss))
        if i % 500 == 0:
            saver.save(sess, outputFile)
            print("Intermediate file saved at: " + outputFile)
    saver.save(sess, outputFile)
    print("Intermediate file saved at: " + outputFile)


