Training

Notes on Docker-based TensorFlow Serving.

The experiment environment:

Training environment: AWS SageMaker Training Jobs

Inference environment: AWS SageMaker Notebook

The files involved are:

make_container.sh

container/Dockerfile

container/code/train

The Dockerfile content is:

FROM ubuntu:latest

RUN apt-get update \
  && apt-get install -y python3-pip python3-dev \
  && cd /usr/local/bin \
  && ln -s /usr/bin/python3 python \
  && pip3 install --upgrade pip

COPY code /opt/program
WORKDIR /opt/program

ENV PATH="/opt/program:${PATH}"
ENV PATH="/usr/local:${PATH}"
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

RUN pip install boto3 tensorflow cos-python-sdk-v5

RUN chmod +x train
ENV AWS_DEFAULT_REGION=region-name
ENV AWS_ACCESS_KEY_ID=AK
ENV AWS_SECRET_ACCESS_KEY=SK

make_container.sh builds the Docker image locally and pushes it to ECR. It takes the image name as its first argument (e.g. bash make_container.sh my-image). Its content is:

#!/bin/bash

# The name of our algorithm
algorithm_name=$1

cd container

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

The content of train is as follows.

Note that train has no .py extension, so the file must start with the shebang line #!/usr/bin/env python, and the Dockerfile must run chmod +x train to make it executable.

#!/usr/bin/env python
import os
import sys
from importlib import reload
import boto3

prefix = '/opt/ml/'

input_path = prefix + 'input/data'
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

# This algorithm has a single channel of input data called 'training'. Since we run in
# File mode, the input files are copied to the directory specified here.
channel_name = 'training'
training_path = os.path.join(input_path, channel_name)


# The function to execute the training.
def train():
    print('Starting the training.')

    sys.path.append('/usr/local/bin')

    s3 = boto3.resource('s3')
    bucket = s3.Bucket('bucket_name')
    print('Downloading')
    for obj in bucket.objects.filter(Prefix='prefix_name'):
        print(obj)
        if obj.key.endswith('/'):
            continue
        if not os.path.exists(os.path.dirname('/usr/local/bin/' + obj.key)):
            os.makedirs(os.path.dirname('/usr/local/bin/' + obj.key))
        bucket.download_file(obj.key, '/usr/local/bin/' + obj.key)  # save to same path

    from train_code import train_runner
    train_runner.run()

    print('Training complete.')


if __name__ == '__main__':
    train()

    # A zero exit code causes the job to be marked a Succeeded.
    sys.exit(0)
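The script defines param_path but never reads it. SageMaker writes any hyperparameters passed to the Estimator into /opt/ml/input/config/hyperparameters.json inside the training container; if they were needed, a minimal sketch for reading them could look like this (the epochs key is hypothetical, not something this job actually passes):

import json

# Hypothetical sketch: SageMaker serializes Estimator hyperparameters into this
# JSON file inside the training container; all values arrive as strings.
param_path = '/opt/ml/input/config/hyperparameters.json'

with open(param_path, 'r') as f:
    hyperparameters = json.load(f)

# Cast as needed ('epochs' is an assumed key used only for illustration).
epochs = int(hyperparameters.get('epochs', '5'))
print('epochs =', epochs)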

train downloads a train_runner.py file from the bucket. The content of train_runner.py is:

#!/usr/bin/env python

import sys
assert sys.version_info.major == 3, 'This script requires Python 3'

import tensorflow as tf
from tensorflow import keras

import numpy as np
import os
import subprocess

import tempfile

import boto3
s3 = boto3.client('s3')



def train_and_save():

    fashion_mnist = keras.datasets.fashion_mnist
    (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

    # scale the values to 0.0 to 1.0
    train_images = train_images / 255.0
    test_images = test_images / 255.0

    # reshape for feeding into the model
    train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
    test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)

    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    print('\ntrain_images.shape: {}, of {}'.format(train_images.shape, train_images.dtype))
    print('test_images.shape: {}, of {}'.format(test_images.shape, test_images.dtype))

    model = keras.Sequential([
        keras.layers.Conv2D(input_shape=(28, 28, 1), filters=8, kernel_size=3,
                            strides=2, activation='relu', name='Conv1'),
        keras.layers.Flatten(),
        keras.layers.Dense(10, activation=tf.nn.softmax, name='Softmax')
    ])
    model.summary()

    testing = False
    epochs = 5

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_images, train_labels, epochs=epochs)

    test_loss, test_acc = model.evaluate(test_images, test_labels)
    print('\nTest accuracy: {}'.format(test_acc))

    MODEL_DIR = tempfile.gettempdir()
    version = 1
    export_path = os.path.join(MODEL_DIR, str(version))
    print('export_path = {}\n'.format(export_path))

    tf.keras.models.save_model(
        model,
        export_path,
        overwrite=True,
        include_optimizer=True,
        save_format=None,
        signatures=None,
        options=None
    )

def uploadDirectory(path, no_use_path, bucket, prefix):
    assert no_use_path.endswith('/')
    for root, dirs, files in os.walk(path):
        for file in files:
            full_path = os.path.join(root, file)

            key = os.path.join(prefix, full_path.replace(no_use_path, ''))
            with open(full_path, "rb") as f:
                s3.upload_fileobj(f, bucket, key)

def run():
    # Train and save the model to /tmp/1
    train_and_save()
    # Upload the contents of /tmp/1 to S3
    uploadDirectory('/tmp/1/', '/tmp/1/', 'bucket_name', 'prefix_name/')

Now we can launch the Training Job:

import sagemaker as sage
# Build the Estimator: image, execution role, output model location
# (bucket/prefix/training_job_name/output), and session

# S3 location of the training data
data_location = 's3://bucket_name/prefix_name/'
# Docker image URI in ECR
image = 'user_id.dkr.ecr.region-id.amazonaws.com/image-name:latest'
# S3 output location
output = 's3://bucket_name/prefix_name/'
# Training job execution role
role = 'arn:aws:iam::337058716437:role/RoleName'

def train(data_location, image, output_path, role):

    sess = sage.Session()

    model = sage.estimator.Estimator(image,
                                     role, 1, 'ml.c4.2xlarge',
                                     output_path=output_path,
                                     sagemaker_session=sess)
    model.fit({'training': data_location})

train(data_location, image, output, role)
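If you wanted to pass hyperparameters (to be read from hyperparameters.json as sketched earlier), the generic Estimator also accepts a hyperparameters dict. A hedged variant of the call above, with a hypothetical epochs key:

model = sage.estimator.Estimator(image,
                                 role, 1, 'ml.c4.2xlarge',
                                 output_path=output,
                                 sagemaker_session=sage.Session(),
                                 # SageMaker writes this dict to
                                 # /opt/ml/input/config/hyperparameters.json
                                 hyperparameters={'epochs': '10'})
model.fit({'training': data_location})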

At the end of the training job there is a /tmp/1 folder, and this folder is uploaded to S3.

The 1 folder contains:

saved_model.pb

variables/variables.data-00000-of-00001

variables/variables.index
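As a quick sanity check before building the serving image, the exported SavedModel can be loaded back and its serving signature inspected. A minimal sketch, assuming TensorFlow is installed and the export is available at /tmp/1 (or in a local 1 folder pulled down from S3):

import tensorflow as tf

# Load the SavedModel produced by train_and_save().
loaded = tf.saved_model.load('/tmp/1')

# 'serving_default' is the signature tensorflow_model_server will expose.
infer = loaded.signatures['serving_default']
print(infer.structured_input_signature)
print(infer.structured_outputs)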

Serving

The Dockerfile for serving is below.

It assumes the current directory contains a folder named 1 holding the model files (the SavedModel exported above).

FROM ubuntu:18.04

#
# Reference: https://github.com/tensorflow/serving/issues/819
#

# Install general packages
RUN apt-get update && apt-get install -y \
        curl \
        libcurl3-dev \
        unzip \
        wget \
        python3-dev \
        && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Previous Installation of tensorflow-model-server (BROKEN RECENTLY)
#RUN echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list \
#    && curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add - \
#    && apt-get update && apt-get install tensorflow-model-server

# New installation of tensorflow-model-server
RUN TEMP_DEB="$(mktemp)" \
    && wget -O "$TEMP_DEB" 'http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-2.3.0/t/tensorflow-model-server/tensorflow-model-server_2.3.0_all.deb' \
    && dpkg -i "$TEMP_DEB" \
    && rm -f "$TEMP_DEB"

# REST API port (tensorflow_model_server is started with --rest_api_port=9000 below)
EXPOSE 9000
# Extra exposed port (unused in this walkthrough)
EXPOSE 9001

COPY 1 /model/1

Then build the Docker image:

docker build -t serve .

Next, start the container.

Use a port mapping: -p 9000:9000 maps port 9000 inside the container to port 9000 on the host.

docker run -it -p 9000:9000 serve

Finally, inside the container, serve the model:

tensorflow_model_server --rest_api_port=9000 --model_name=fashion_model --model_base_path=/model
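Before sending prediction requests, it is worth confirming that the model actually loaded. TensorFlow Serving's REST API exposes a model status endpoint; a small check from the host, assuming the container is running with the port mapping above:

import requests

# The status endpoint lists the loaded versions; a state of 'AVAILABLE'
# means the model is ready to serve requests.
resp = requests.get('http://localhost:9000/v1/models/fashion_model')
print(resp.json())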

Now the model can be queried from outside the container:

from tensorflow import keras
import requests
import json
import numpy as np
import matplotlib.pyplot as plt

def show(idx, title):
    plt.figure()
    plt.imshow(test_images[idx].reshape(28,28))
    plt.axis('off')
    plt.title('\n\n{}'.format(title), fontdict={'size': 16})

fashion_mnist = keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# scale the values to 0.0 to 1.0
train_images = train_images / 255.0
test_images = test_images / 255.0

# reshape for feeding into the model
train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)

class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

print('\ntrain_images.shape: {}, of {}'.format(train_images.shape, train_images.dtype))
print('test_images.shape: {}, of {}'.format(test_images.shape, test_images.dtype))

data = json.dumps({"signature_name": "serving_default", "instances": test_images[0:3].tolist()})
print('Data: {} ... {}'.format(data[:50], data[len(data)-52:]))

headers = {"content-type": "application/json"}
json_response = requests.post('http://localhost:9000/v1/models/fashion_model:predict', data=data, headers=headers)
predictions = json.loads(json_response.text)['predictions']

show(0, 'The model thought this was a {} (class {}), and it was actually a {} (class {})'.format(
  class_names[np.argmax(predictions[0])], np.argmax(predictions[0]), class_names[test_labels[0]], test_labels[0]))



