训练
记录一下基于Docker的TensorFlow Serving
实验环境为:
训练环境:AWS SageMaker Training Jobs
推理环境:AWS SageMaker Notebook
涉及的文件包括:
make_container.sh
container/Dockerfile
container/code/train
Dockerfile内容为
# Training image: Ubuntu base with Python 3, pip, and the `train` entry point.
# NOTE(review): `latest` is an unpinned tag, so a rebuild may pick up a
# different Ubuntu release than the one this was written against.
FROM ubuntu:latest
# Install Python 3 and pip, and create a `python` symlink to python3 in
# /usr/local/bin so the `#!/usr/bin/env python` shebang of `train` resolves.
RUN apt-get update \
&& apt-get install -y python3-pip python3-dev \
&& cd /usr/local/bin \
&& ln -s /usr/bin/python3 python \
&& pip3 install --upgrade pip
# Copy the entry-point code (container/code) into the image and work from there.
COPY code /opt/program
WORKDIR /opt/program
# Put the program directory on PATH (presumably so `train` can be run by name).
ENV PATH="/opt/program:${PATH}"
ENV PATH="/usr/local:${PATH}"
# Use a UTF-8 locale.
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
# boto3 and tensorflow are imported by the training code; cos-python-sdk-v5 is
# not referenced by the code shown in this article.
RUN pip install boto3 tensorflow cos-python-sdk-v5
# `train` has no .py extension and is executed directly, so it must be executable.
RUN chmod +x train
# Region and credentials read by boto3 inside the container (placeholders here).
# NOTE(review): values set with ENV are stored in the image, so anyone who can
# pull the image can read these keys — confirm whether the training job's
# execution role can supply credentials instead.
ENV AWS_DEFAULT_REGION=region-name
ENV AWS_ACCESS_KEY_ID=AK
ENV AWS_SECRET_ACCESS_KEY=SK
make_container.sh的内容为:
make_container.sh的任务包括:本地构建镜像;推送到ECR
# Build the training image locally and push it to Amazon ECR.
# Usage: make_container.sh <algorithm_name>

# The name of our algorithm (used as both the local image name and the ECR repository name)
algorithm_name=$1
if [ -z "${algorithm_name}" ]
then
    echo "Usage: $0 <algorithm_name>" >&2
    exit 1
fi
# The Dockerfile and its build context live in ./container
cd container || exit 1
account=$(aws sts get-caller-identity --query Account --output text) || exit 1
# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}
registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"
# If the repository doesn't exist in ECR, create it.
# The region is passed explicitly so these calls target the same registry as ${fullname}.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null || exit 1
fi
# Log Docker in to the registry host. The login target is the registry,
# not the image name with its repository path and tag.
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}" || exit 1
# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -t "${algorithm_name}" . || exit 1
docker tag "${algorithm_name}" "${fullname}" || exit 1
docker push "${fullname}"
train的内容为:
注意train文件没有.py后缀,所以需要在文件的第一行写上shebang:
#!/usr/bin/env python
并在Dockerfile中加上chmod +x train命令,使其可以被直接执行。
#!/usr/bin/env python
import os
import sys
from importlib import reload
import boto3
# Directory layout used by the training container; everything hangs off `prefix`.
prefix = '/opt/ml/'
input_path = '{}input/data'.format(prefix)
output_path, model_path = (
    os.path.join(prefix, leaf) for leaf in ('output', 'model')
)
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')
# The algorithm takes a single input channel named 'training'. In File mode
# the channel's files are copied into the directory computed below.
channel_name = 'training'
training_path = os.path.join(input_path, channel_name)
# The function to execute the training.
def train():
    """Download the training code from S3 and run it.

    Every object under the ``prefix_name`` prefix of ``bucket_name`` is saved
    to ``/usr/local/bin/<object key>``. ``train_code.train_runner`` is then
    imported from that directory and its ``run()`` is called.
    """
    print('Starting the training.')
    code_root = '/usr/local/bin/'
    # Make the download directory importable before the deferred import below.
    sys.path.append('/usr/local/bin')
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('bucket_name')
    print('Downloading')
    for obj in bucket.objects.filter(Prefix='prefix_name'):
        print(obj)
        # Keys ending in '/' are presumably folder placeholders, not files.
        if obj.key.endswith('/'):
            continue
        local_path = code_root + obj.key  # save to same path
        # exist_ok=True replaces the separate exists() check, which could race
        # with another process creating the directory.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        bucket.download_file(obj.key, local_path)
    # Deferred import: the module only exists once the download has finished.
    # NOTE(review): this requires the downloaded keys to live under
    # 'train_code/' — confirm against the real S3 prefix.
    from train_code import train_runner
    train_runner.run()
    print('Training complete.')
if __name__ == '__main__':
    # Script entry point: run the training, then exit explicitly.
    train()
    # Exit status 0 is what causes the job to be marked as Succeeded.
    raise SystemExit(0)
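train会从bucket下载一个train_runner.py文件(文件会保存到/usr/local/bin/下与对象key相同的路径,因此它在bucket中的key应为train_code/train_runner.py,这样from train_code import train_runner才能导入成功),train_runner.py文件的内容为: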
#!/usr/bin/env python
import sys
# Fail fast when not running under Python 3. The comparison is by value:
# `is` tests object identity and emits a SyntaxWarning with a literal on 3.8+.
assert sys.version_info.major == 3, 'Oops, not running Python 3. Use Runtime > Change runtime type'
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import subprocess
import tempfile
import boto3
# Module-level S3 client; uploadDirectory() uploads files through it.
s3 = boto3.client('s3')
def train_and_save():
    """Train a small CNN on Fashion-MNIST and save it as model version 1.

    The images are scaled to the 0.0-1.0 range and reshaped to
    ``(N, 28, 28, 1)``. The model is trained for 5 epochs, evaluated on the
    test split, and saved with ``tf.keras.models.save_model`` into
    ``<system temp dir>/1``.

    Returns:
        The directory the model was saved to.
    """
    fashion_mnist = keras.datasets.fashion_mnist
    (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
    # scale the values to 0.0 to 1.0
    train_images = train_images / 255.0
    test_images = test_images / 255.0
    # reshape for feeding into the model
    train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
    test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)
    print('\ntrain_images.shape: {}, of {}'.format(train_images.shape, train_images.dtype))
    print('test_images.shape: {}, of {}'.format(test_images.shape, test_images.dtype))
    # One strided convolution followed by a 10-way softmax classifier.
    model = keras.Sequential([
        keras.layers.Conv2D(input_shape=(28, 28, 1), filters=8, kernel_size=3,
                            strides=2, activation='relu', name='Conv1'),
        keras.layers.Flatten(),
        keras.layers.Dense(10, activation=tf.nn.softmax, name='Softmax')
    ])
    model.summary()
    epochs = 5
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_images, train_labels, epochs=epochs)
    _, test_acc = model.evaluate(test_images, test_labels)
    print('\nTest accuracy: {}'.format(test_acc))
    # The model is saved into a sub-directory named after the version number.
    MODEL_DIR = tempfile.gettempdir()
    version = 1
    export_path = os.path.join(MODEL_DIR, str(version))
    print('export_path = {}\n'.format(export_path))
    tf.keras.models.save_model(
        model,
        export_path,
        overwrite=True,
        include_optimizer=True,
        save_format=None,
        signatures=None,
        options=None
    )
    return export_path
def uploadDirectory(path, no_use_path, bucket, prefix):
    """Upload every file under *path* to S3, mirroring the directory layout.

    Args:
        path: Local directory to walk recursively.
        no_use_path: Leading local directory to strip from each file path
            when building its object key. A trailing '/' is accepted but no
            longer required.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix placed in front of each relative file path.
    """
    import posixpath  # local import: S3 keys always use '/' as separator

    for root, _dirs, files in os.walk(path):
        for file_name in files:
            full_path = os.path.join(root, file_name)
            # relpath strips only the leading directory; str.replace() would
            # also delete any later occurrence of the same text in the path.
            rel_path = os.path.relpath(full_path, no_use_path)
            key = posixpath.join(prefix, *rel_path.split(os.sep))
            with open(full_path, "rb") as f:
                s3.upload_fileobj(f, bucket, key)
def run():
    """Train the model, then upload the saved model directory to S3."""
    # Train and save the model to <system temp dir>/1
    train_and_save()
    # Rebuild the location train_and_save() writes to instead of hard-coding
    # '/tmp/1/', so both agree when the temp dir is not /tmp. The trailing
    # separator keeps the 'dir/' form the upload call has always been given.
    export_dir = os.path.join(tempfile.gettempdir(), '1', '')
    # Store the contents of the saved model directory in S3
    uploadDirectory(export_dir, export_dir, 'bucket_name', 'prefix_name/')
下面就可以启动Training Job了:
import sagemaker as sage
# Build the Estimator from: the image, the execution role, the model output location (bucket/prefix/training_id/output), and the session
# S3 location of the training set
data_location = 's3://bucket_name/prefix_name/'
# Docker image URI
image = 'user_id.dkr.ecr.region-id.amazonaws.com/image-name:latest'
# Output location
output = 's3://bucket_name/prefix_name/'
# TrainingJobRole
role = 'arn:aws:iam::337058716437:role/RoleName'
def train(data_location, image, output_path, role):
    """Start a SageMaker training job that runs the custom image.

    Args:
        data_location: S3 URI passed to the job as its input channel.
        image: URI of the training Docker image.
        output_path: S3 URI handed to the Estimator as its output location.
        role: ARN of the IAM role the Estimator is created with.
    """
    sess = sage.Session()
    # Positional arguments after the role: instance count, instance type.
    model = sage.estimator.Estimator(image,
                                     role, 1, 'ml.c4.2xlarge',
                                     output_path=output_path,
                                     sagemaker_session=sess,
                                     )
    # The channel key matches `channel_name = 'training'` declared by the
    # container's train script.
    model.fit({'training': data_location})


train(data_location, image, output, role)
训练结束后,容器内会生成一个/tmp/1文件夹,并且这个文件夹会被上传到S3。
1文件夹中包括:
saved_model.pb
variables/variables.data-00000-of-00001
variables/variables.index
Serving
Serving的Dockerfile内容为:
默认当前目录中有一个
1
文件夹,里面包含模型文件
# Serving image: Ubuntu 18.04 with tensorflow-model-server 2.3.0 and the
# model directory `1` copied to /model/1.
FROM ubuntu:18.04
#
# Reference: https://github.com/tensorflow/serving/issues/819
#
# Install general packages
RUN apt-get update && apt-get install -y \
curl \
libcurl3-dev \
unzip \
wget \
python3-dev \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Previous Installation of tensorflow-model-server (BROKEN RECENTLY)
#RUN echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list \
# && curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add - \
# && apt-get update && apt-get install tensorflow-model-server
# New installation of tensorflow-model-server: download the 2.3.0 .deb to a
# temporary file, install it with dpkg, then delete the file.
# NOTE(review): the package is fetched over plain http — confirm whether the
# https URL can be used instead.
RUN TEMP_DEB="$(mktemp)" \
&& wget -O "$TEMP_DEB" 'http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-2.3.0/t/tensorflow-model-server/tensorflow-model-server_2.3.0_all.deb' \
&& dpkg -i "$TEMP_DEB" \
&& rm -f "$TEMP_DEB"
# NOTE(review): the article later starts the server with --rest_api_port=9000,
# so the REST API is served on the port labelled gRPC below, and port 9001 is
# not used in the steps shown.
# gRPC port
EXPOSE 9000
# REST API port
EXPOSE 9001
# Model version directory: the server is later pointed at /model as its base path.
COPY 1 /model/1
之后,构建docker镜像:
docker build -t serve .
接下来,就可以启动docker了
做一个端口映射,
-p 9000:9000
把docker内部的9000映射到外面
# Start the image built above (tagged "serve"), publishing container port 9000 on the host.
docker run -it -p 9000:9000 serve
最后,在docker里面把模型serve起来
tensorflow_model_server --rest_api_port=9000 --model_name=fashion_model --model_base_path=/model
现在可以在外面来访问docker了:
from tensorflow import keras
import requests
import json
import numpy as np
import matplotlib.pyplot as plt
def show(idx, title):
    """Draw test image ``idx`` in a new figure, axes hidden, with ``title`` as caption.

    Reads the module-level ``test_images`` array.
    """
    plt.figure()
    image = test_images[idx].reshape(28, 28)
    plt.imshow(image)
    plt.axis('off')
    # Two leading newlines are prepended to the caption text.
    caption = '\n\n{}'.format(title)
    plt.title(caption, fontdict={'size': 16})
# Load Fashion-MNIST and preprocess it the same way as at training time.
fashion_mnist = keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# scale the values to 0.0 to 1.0
train_images = train_images / 255.0
test_images = test_images / 255.0
# reshape for feeding into the model
train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
print('\ntrain_images.shape: {}, of {}'.format(train_images.shape, train_images.dtype))
print('test_images.shape: {}, of {}'.format(test_images.shape, test_images.dtype))
# Request body: the first three test images as nested lists.
data = json.dumps({"signature_name": "serving_default", "instances": test_images[0:3].tolist()})
print('Data: {} ... {}'.format(data[:50], data[-52:]))
headers = {"content-type": "application/json"}
json_response = requests.post('http://localhost:9000/v1/models/fashion_model:predict', data=data, headers=headers)
# Raise on an HTTP error status here, rather than failing later with a
# KeyError on the missing 'predictions' field.
json_response.raise_for_status()
predictions = json_response.json()['predictions']
# Index of the highest score for the first image.
predicted_class = np.argmax(predictions[0])
show(0, 'The model thought this was a {} (class {}), and it was actually a {} (class {})'.format(
    class_names[predicted_class], predicted_class, class_names[test_labels[0]], test_labels[0]))