From 0fcc4ab4c80a85c3ece64e44c0984fd0dccd94c6 Mon Sep 17 00:00:00 2001 From: Satya Date: Fri, 9 Jul 2021 04:34:54 -0400 Subject: [PATCH 1/8] Adding TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- Examples/tensorflow/BERT/Makefile | 56 ++++++++ .../tensorflow/BERT/python.manifest.template | 78 +++++++++++ .../tensorflow/BERT/root/.keras/keras.json | 0 Examples/tensorflow/README.md | 125 ++++++++++++++++++ Examples/tensorflow/ResNet50/Makefile | 49 +++++++ .../ResNet50/python.manifest.template | 83 ++++++++++++ .../ResNet50/root/.keras/keras.json | 6 + 7 files changed, 397 insertions(+) create mode 100755 Examples/tensorflow/BERT/Makefile create mode 100755 Examples/tensorflow/BERT/python.manifest.template create mode 100644 Examples/tensorflow/BERT/root/.keras/keras.json create mode 100755 Examples/tensorflow/README.md create mode 100755 Examples/tensorflow/ResNet50/Makefile create mode 100755 Examples/tensorflow/ResNet50/python.manifest.template create mode 100644 Examples/tensorflow/ResNet50/root/.keras/keras.json diff --git a/Examples/tensorflow/BERT/Makefile b/Examples/tensorflow/BERT/Makefile new file mode 100755 index 0000000000..e7130b1c7c --- /dev/null +++ b/Examples/tensorflow/BERT/Makefile @@ -0,0 +1,56 @@ +# BERT sample for Tensorflow + +GRAPHENEDIR ?= ../../.. 
+SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem + +include $(GRAPHENEDIR)/Scripts/Makefile.configs + +ifeq ($(DEBUG),1) +GRAPHENE_LOG_LEVEL = debug +else +GRAPHENE_LOG_LEVEL = error +endif + +.PHONY: all +all: python.manifest +ifeq ($(SGX),1) +all: python.manifest.sgx python.sig python.token +endif + +collateral: + apt install unzip + test -d models || git clone https://github.com/IntelAI/models.git + mkdir -p data + test -f data/wwm_uncased_L-24_H-1024_A-16.zip || wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip -P data/ + test -d data/wwm_uncased_L-24_H-1024_A-16 || unzip data/wwm_uncased_L-24_H-1024_A-16.zip -d data + test -f data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json || wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -P data/wwm_uncased_L-24_H-1024_A-16 + test -f data/bert_large_checkpoints.zip || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip -P data/ + test -d data/bert_large_checkpoints || unzip data/bert_large_checkpoints.zip -d data + test -f data/asymmetric_per_channel_bert_int8.pb || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/r2.5-icx-b631821f/asymmetric_per_channel_bert_int8.pb -P data/ + +python.manifest: python.manifest.template collateral + graphene-manifest \ + -Dlog_level=$(GRAPHENE_LOG_LEVEL) \ + -Darch_libdir=$(ARCH_LIBDIR) \ + -Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \ + -Dpythondistpath=$(PYTHONDISTPATH) \ + $< >$@ + +python.manifest.sgx: python.manifest + graphene-sgx-sign \ + --key $(SGX_SIGNER_KEY) \ + --manifest $< -output $@ + +python.sig: python.manifest.sgx + +python.token: python.sig + graphene-sgx-get-token -output $@ -sig $< + +.PHONY: clean +clean: + $(RM) *.manifest *.manifest.sgx *.token *.sig + +.PHONY: distclean +distclean: clean + $(RM) -r models/ data/ + diff --git a/Examples/tensorflow/BERT/python.manifest.template 
b/Examples/tensorflow/BERT/python.manifest.template new file mode 100755 index 0000000000..9016d94391 --- /dev/null +++ b/Examples/tensorflow/BERT/python.manifest.template @@ -0,0 +1,78 @@ +# This manifest was tested on Ubuntu 18.04 with python3.6. + +libos.entrypoint = "{{ entrypoint }}" +loader.preload = "file:{{ graphene.libos }}" + +# Graphene log level +loader.log_level = "{{ log_level }}" + +# Read application arguments directly from the command line. Don't use this on production! +loader.insecure__use_cmdline_argv = 1 + +# Propagate environment variables from the host. Don't use this on production! +loader.insecure__use_host_env = 1 + +# Disable address space layour randomization. Don't use this on production! +loader.insecure__disable_aslr = 1 + +# Update Library Path - overwrites environment variable +loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" + +# Default glibc files, mounted from graphene's Runtime directory +fs.mount.lib.type = "chroot" +fs.mount.lib.path = "/lib" +fs.mount.lib.uri = "file:{{ graphene.runtimedir() }}" + +# More libraries required by Tensorflow +fs.mount.lib2.type = "chroot" +fs.mount.lib2.path = "{{ arch_libdir }}" +fs.mount.lib2.uri = "file:{{ arch_libdir }}" + +fs.mount.usr.type = "chroot" +fs.mount.usr.path = "/usr" +fs.mount.usr.uri = "file:/usr" + +fs.mount.pyhome.type = "chroot" +fs.mount.pyhome.path = "{{ python.stdlib }}" +fs.mount.pyhome.uri = "file:{{ python.stdlib }}" + +fs.mount.pydisthome.type = "chroot" +fs.mount.pydisthome.path = "{{ python.distlib }}" +fs.mount.pydisthome.uri = "file:{{ python.distlib }}" + +fs.mount.pydistpath.type = "chroot" +fs.mount.pydistpath.path = "{{ pythondistpath }}" +fs.mount.pydistpath.uri = "file:{{ pythondistpath }}" + +fs.mount.tmp.type = "chroot" +fs.mount.tmp.path = "/tmp" +fs.mount.tmp.uri = "file:/tmp" + +fs.mount.etc.type = "chroot" +fs.mount.etc.path = "/etc" +fs.mount.etc.uri = "file:/etc" + +# SGX general options 
+sgx.enclave_size = "32G" +sgx.thread_num = 256 +sgx.preheat_enclave = 1 +sgx.nonpie_binary = 1 + +# SGX trusted files +sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/" +sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/" +sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/" +sgx.trusted_files.libcpp = "file:/usr/lib/x86_64-linux-gnu/libstdc++.so.6" +sgx.trusted_files.libgcc = "file:/lib/x86_64-linux-gnu/libgcc_s.so.1" + +sgx.allowed_files.tmp = "file:/tmp" +sgx.allowed_files.etc = "file:/etc" +sgx.allow_file_creation = "1" +sgx.allowed_files.output = "file:output" +sgx.allowed_files.scripts = "file:models/models/language_modeling/tensorflow/bert_large/inference" +sgx.allowed_files.dataDir = "file:data" +sgx.allowed_files.python = "file:{{ entrypoint }}" +sgx.allowed_files.pyhome = "file:{{ python.stdlib }}" +sgx.allowed_files.pydisthome = "file:{{ python.distlib }}" +sgx.allowed_files.pydistpath = "file:{{ pythondistpath }}" +sgx.allowed_files.keras = "file:root/.keras/keras.json" diff --git a/Examples/tensorflow/BERT/root/.keras/keras.json b/Examples/tensorflow/BERT/root/.keras/keras.json new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md new file mode 100755 index 0000000000..7dd5ed1435 --- /dev/null +++ b/Examples/tensorflow/README.md @@ -0,0 +1,125 @@ +## Run inference on TensorFlow BERT and ResNet50 models +This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50 sample workloads on Graphene. Specifically, both these examples use pre-trained models to run inference. We tested this on Ubuntu 18.04 and uses the package version of Python 3.6. + +## Pre-System setting +Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency to achieve best performance or to save power based on the requirement. 
To achieve the best peformance, please set the CPU frequency scaling governor to performance mode. + +``for ((i=0; i<$(nproc); i++)); do echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor; done`` + +## Pre-requisites +- Install python3.6. +- Upgrade pip/pip3. +- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. + +## Build BERT or ResNet50 samples +- To build BERT sample, do ``cd BERT`` or to build ResNet50 sample, do ``cd ResNet50``. +- To clean the sample, do ``make clean`` +- To clean and remove downloaded models and datasets, do ``make distclean`` +- To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/`` +- To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1`` +>**NOTE** Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages', but can change based on python's installation directory. + +## Run inference on BERT model +- To run int8 inference on graphene-sgx(SGX version)
+``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36`` +- To run int8 inference on graphene-direct(non-SGX version)
+``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36`` +- To run int8 inference on native baremetal(outside graphene)
+``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36`` +- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. + - OMP_NUM_THREADS='Core(s) per socket' + - taskset to 'Core(s) per socket' + - intra_op_parallelism_threads='Core(s) per socket' +>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + +## Run inference on ResNet50 model +- To run inference on graphene-sgx(SGX version)
+``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=512 --warmup-steps=50 --steps=500`` +- To run inference on graphene-direct(non-SGX version)
+``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=512 --warmup-steps=50 --steps=500`` +- To run inference on native baremetal(outside graphene)
+``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500`` +- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. + - OMP_NUM_THREADS='Core(s) per socket' + - taskset to 'Core(s) per socket' + - num-intra-threads='Core(s) per socket' +>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + +# GSC : + +## Build graphenize Docker image and run BERT inference : +1. ``cd $(GRAPHENE_DIR)/Tools/gsc`` + +2. Create a configuration file : ``cp config.yaml.template config.yaml`` +Manually adopt config.yaml to the installed Intel SGX driver and desired Graphene repository/version + +3. Generate the signing key : ``openssl genrsa -3 -out enclave-key.pem 3072`` + +4. Build docker image : + - ``cd test`` + - ``docker build --rm -t ubuntu18.04-tensorflow-bert -f ubuntu18.04-tensorflow-bert.dockerfile ../../../Examples`` + +5. Graphenize the docker image using gsc build : + - ``cd ..`` + - ``./gsc build --insecure-args ubuntu18.04-tensorflow-bert test/ubuntu18.04-tensorflow.manifest`` + +6. Sign the graphenized Docker image using gsc sign-image : ``./gsc sign-image ubuntu18.04-tensorflow-bert enclave-key.pem`` + +7. To run int8 inference on GSC
+``docker run --device=/dev/sgx_encalve --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_BLOCKTIME=1 --env KMP_SETTINGS=1 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 gsc-ubuntu18.04-tensorflow-bert models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36 --output_dir=output/bert-squad-output`` + +8. To run int8 inference on native container
+``docker run --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_BLOCKTIME=1 --env KMP_SETTINGS=1 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 ubuntu18.04-tensorflow-bert models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36 --output_dir=output/bert-squad-output`` + +9. Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. + - OMP_NUM_THREADS='Core(s) per socket' + - --cpuset-cpus to 'Core(s) per socket' + - intra_op_parallelism_threads='Core(s) per socket' +>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + +## Build graphenize Docker image and run ResNet50 inference : +1. ``cd $(GRAPHENE_DIR)/Tools/gsc`` + +2. Create a configuration file : ``cp config.yaml.template config.yaml`` +Manually adopt config.yaml to the installed Intel SGX driver and desired Graphene repository/version + +3. Generate the signing key : ``openssl genrsa -3 -out enclave-key.pem 3072`` + +4. Build docker image : + - ``cd test`` + - ``docker build --rm -t ubuntu18.04-tensorflow-resnet50 -f ubuntu18.04-tensorflow-resnet50.dockerfile ../../../Examples`` + +5. Graphenize the docker image using gsc build : + - ``cd ..`` + - ``./gsc build --insecure-args ubuntu18.04-tensorflow-resnet50 test/ubuntu18.04-tensorflow.manifest`` + +6. Sign the graphenized Docker image using gsc sign-image : ``./gsc sign-image ubuntu18.04-tensorflow-resnet50 enclave-key.pem`` + +7. 
To run inference on GSC
+``docker run --device=/dev/sgx_enclave --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 gsc-ubuntu18.04-tensorflow-resnet50 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500`` + > **NOTE**: When OOM happens pass option ``-env TF_MKL_ALLOC_MAX_BYTES=34359738368`` to docker run command. +8. To run inference on native Container
+``docker run --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 ubuntu18.04-tensorflow-resnet50 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500`` + +9. Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. + - OMP_NUM_THREADS='Core(s) per socket' + - --cpuset-cpus to 'Core(s) per socket' + - num-intra-threads='Core(s) per socket' +>**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + +## Performance considerations +- Preheat manifest option pre-faults the enclave memory and moves the performance penalty to graphene-sgx invocation (before the workload starts executing). To use preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template. +- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help improve performance significantly based on the workloads. At any point, only one of these allocators can be used. + - TCMalloc (Please update the binary location and name if different from default) + - Install tcmalloc : sudo apt-get install google-perftools + - Add these in the manifest template
+ ``loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``
+ ``sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``
+ ``sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"`` + - Save the template and rebuild. + - mimalloc (Please update the binary location and name if different from default) + - Install mimalloc using the steps from https://github.com/microsoft/mimalloc + - Add these in the manifest template
+ ``loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``
+ ``sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"`` + - Save the template and rebuild. diff --git a/Examples/tensorflow/ResNet50/Makefile b/Examples/tensorflow/ResNet50/Makefile new file mode 100755 index 0000000000..035a262de5 --- /dev/null +++ b/Examples/tensorflow/ResNet50/Makefile @@ -0,0 +1,49 @@ +# ResNet50 sample for Tensorflow + +GRAPHENEDIR ?= ../../.. +SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem + +include $(GRAPHENEDIR)/Scripts/Makefile.configs + +ifeq ($(DEBUG),1) +GRAPHENE_LOG_LEVEL = debug +else +GRAPHENE_LOG_LEVEL = error +endif + +.PHONY: all collateral +all: python.manifest +ifeq ($(SGX),1) +all: python.manifest.sgx python.sig python.token +endif + +collateral: + test -d models || git clone https://github.com/IntelAI/models.git + test -f resnet50v1_5_int8_pretrained_model.pb || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/resnet50v1_5_int8_pretrained_model.pb + +python.manifest: python.manifest.template collateral + graphene-manifest \ + -Dlog_level=$(GRAPHENE_LOG_LEVEL) \ + -Darch_libdir=$(ARCH_LIBDIR) \ + -Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \ + -Dpythondistpath=$(PYTHONDISTPATH) \ + $< >$@ + +python.manifest.sgx: python.manifest + graphene-sgx-sign \ + --key $(SGX_SIGNER_KEY) \ + --manifest python.manifest \ + --output $@ + +python.sig: python.manifest.sgx + +python.token: python.sig + graphene-sgx-get-token -output $@ -sig $< + +.PHONY: clean +clean: + $(RM) *.manifest *.manifest.sgx *.token *.sig + +.PHONY: distclean +distclean: clean + $(RM) -r models/ resnet50v1_5_int8_pretrained_model.pb diff --git a/Examples/tensorflow/ResNet50/python.manifest.template b/Examples/tensorflow/ResNet50/python.manifest.template new file mode 100755 index 0000000000..1f50a0bb12 --- /dev/null +++ b/Examples/tensorflow/ResNet50/python.manifest.template @@ -0,0 +1,83 @@ +# This manifest was tested on Ubuntu 18.04 with python3.6. 
+ +libos.entrypoint = "{{ entrypoint }}" +loader.preload = "file:{{ graphene.libos }}" + +# Graphene log level +loader.log_level = "{{ log_level }}" + +# Read application arguments directly from the command line. Don't use this on production! +loader.insecure__use_cmdline_argv = 1 + +# Propagate environment variables from the host. Don't use this on production! +loader.insecure__use_host_env = 1 + +# Disable address space layour randomization. Don't use this on production! +loader.insecure__disable_aslr = 1 + +# Update Library Path - overwrites environment variable +loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" + +# Default glibc files, mounted from graphene's Runtime directory +fs.mount.lib.type = "chroot" +fs.mount.lib.path = "/lib" +fs.mount.lib.uri = "file:{{ graphene.runtimedir() }}" + +# More libraries required by Tensorflow +fs.mount.lib2.type = "chroot" +fs.mount.lib2.path = "{{ arch_libdir }}" +fs.mount.lib2.uri = "file:{{ arch_libdir }}" + +fs.mount.usr.type = "chroot" +fs.mount.usr.path = "/usr" +fs.mount.usr.uri = "file:/usr" + +fs.mount.bin.type = "chroot" +fs.mount.bin.path = "/bin" +fs.mount.bin.uri = "file:/bin" + +fs.mount.pyhome.type = "chroot" +fs.mount.pyhome.path = "{{ python.stdlib }}" +fs.mount.pyhome.uri = "file:{{ python.stdlib }}" + +fs.mount.pydisthome.type = "chroot" +fs.mount.pydisthome.path = "{{ python.distlib }}" +fs.mount.pydisthome.uri = "file:{{ python.distlib }}" + +fs.mount.pydistpath.type = "chroot" +fs.mount.pydistpath.path = "{{ pythondistpath }}" +fs.mount.pydistpath.uri = "file:{{ pythondistpath }}" + +fs.mount.tmp.type = "chroot" +fs.mount.tmp.path = "/tmp" +fs.mount.tmp.uri = "file:/tmp" + +fs.mount.etc.type = "chroot" +fs.mount.etc.path = "/etc" +fs.mount.etc.uri = "file:/etc" + +# SGX general options +sgx.enclave_size = "32G" +sgx.thread_num = 300 +sgx.preheat_enclave = 1 +sgx.nonpie_binary = 1 + +# SGX trusted files +sgx.trusted_files.runtime = "file:{{ 
graphene.runtimedir() }}/" +sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/" +sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/" +sgx.trusted_files.libcpp = "file:/usr/lib/x86_64-linux-gnu/libstdc++.so.6" +sgx.trusted_files.libgcc = "file:/lib/x86_64-linux-gnu/libgcc_s.so.1" +sgx.trusted_files.model = "file:resnet50v1_5_int8_pretrained_model.pb" + +sgx.allowed_files.tmp = "file:/tmp" +sgx.allowed_files.etc = "file:/etc" +sgx.allow_file_creation = "1" +sgx.allowed_files.proc = "file:/proc" +sgx.allowed_files.cpuinfo = "file:/proc/cpuinfo" +sgx.allowed_files.scripts = "file:models/models/image_recognition/tensorflow/resnet50v1_5/inference" +sgx.allowed_files.python = "file:{{ entrypoint }}" +sgx.allowed_files.pyhome = "file:{{ python.stdlib }}" +sgx.allowed_files.pydisthome = "file:{{ python.distlib }}" +sgx.allowed_files.pydistpath = "file:{{ pythondistpath }}" +sgx.allowed_files.keras = "file:root/.keras/keras.json" diff --git a/Examples/tensorflow/ResNet50/root/.keras/keras.json b/Examples/tensorflow/ResNet50/root/.keras/keras.json new file mode 100644 index 0000000000..bc2cae3746 --- /dev/null +++ b/Examples/tensorflow/ResNet50/root/.keras/keras.json @@ -0,0 +1,6 @@ +{ + "floatx": "float32", + "epsilon": 1e-07, + "backend": "tensorflow", + "image_data_format": "channels_last" +} \ No newline at end of file From 13a09c247ea0a7d11b0d69c479c0d5c7655912a3 Mon Sep 17 00:00:00 2001 From: Satya Date: Fri, 9 Jul 2021 05:20:42 -0400 Subject: [PATCH 2/8] Add TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- Examples/tensorflow/BERT/python.manifest.template | 10 +++++----- Examples/tensorflow/BERT/root/.keras/keras.json | 6 ++++++ Examples/tensorflow/ResNet50/python.manifest.template | 10 +++++----- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Examples/tensorflow/BERT/python.manifest.template b/Examples/tensorflow/BERT/python.manifest.template index 9016d94391..9e4316b7cf 100755 --- 
a/Examples/tensorflow/BERT/python.manifest.template +++ b/Examples/tensorflow/BERT/python.manifest.template @@ -65,12 +65,12 @@ sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/" sgx.trusted_files.libcpp = "file:/usr/lib/x86_64-linux-gnu/libstdc++.so.6" sgx.trusted_files.libgcc = "file:/lib/x86_64-linux-gnu/libgcc_s.so.1" -sgx.allowed_files.tmp = "file:/tmp" -sgx.allowed_files.etc = "file:/etc" +sgx.allowed_files.tmp = "file:/tmp/" +sgx.allowed_files.etc = "file:/etc/" sgx.allow_file_creation = "1" -sgx.allowed_files.output = "file:output" -sgx.allowed_files.scripts = "file:models/models/language_modeling/tensorflow/bert_large/inference" -sgx.allowed_files.dataDir = "file:data" +sgx.allowed_files.output = "file:output/" +sgx.allowed_files.scripts = "file:models/models/language_modeling/tensorflow/bert_large/inference/" +sgx.allowed_files.dataDir = "file:data/" sgx.allowed_files.python = "file:{{ entrypoint }}" sgx.allowed_files.pyhome = "file:{{ python.stdlib }}" sgx.allowed_files.pydisthome = "file:{{ python.distlib }}" diff --git a/Examples/tensorflow/BERT/root/.keras/keras.json b/Examples/tensorflow/BERT/root/.keras/keras.json index e69de29bb2..bc2cae3746 100644 --- a/Examples/tensorflow/BERT/root/.keras/keras.json +++ b/Examples/tensorflow/BERT/root/.keras/keras.json @@ -0,0 +1,6 @@ +{ + "floatx": "float32", + "epsilon": 1e-07, + "backend": "tensorflow", + "image_data_format": "channels_last" +} \ No newline at end of file diff --git a/Examples/tensorflow/ResNet50/python.manifest.template b/Examples/tensorflow/ResNet50/python.manifest.template index 1f50a0bb12..74815e9c81 100755 --- a/Examples/tensorflow/ResNet50/python.manifest.template +++ b/Examples/tensorflow/ResNet50/python.manifest.template @@ -70,12 +70,12 @@ sgx.trusted_files.libcpp = "file:/usr/lib/x86_64-linux-gnu/libstdc++.so.6" sgx.trusted_files.libgcc = "file:/lib/x86_64-linux-gnu/libgcc_s.so.1" sgx.trusted_files.model = "file:resnet50v1_5_int8_pretrained_model.pb" 
-sgx.allowed_files.tmp = "file:/tmp" -sgx.allowed_files.etc = "file:/etc" +sgx.allowed_files.tmp = "file:/tmp/" +sgx.allowed_files.etc = "file:/etc/" sgx.allow_file_creation = "1" -sgx.allowed_files.proc = "file:/proc" -sgx.allowed_files.cpuinfo = "file:/proc/cpuinfo" -sgx.allowed_files.scripts = "file:models/models/image_recognition/tensorflow/resnet50v1_5/inference" +sgx.allowed_files.proc = "file:/proc/" +sgx.allowed_files.cpuinfo = "file:/proc/cpuinfo/" +sgx.allowed_files.scripts = "file:models/models/image_recognition/tensorflow/resnet50v1_5/inference/" sgx.allowed_files.python = "file:{{ entrypoint }}" sgx.allowed_files.pyhome = "file:{{ python.stdlib }}" sgx.allowed_files.pydisthome = "file:{{ python.distlib }}" From 902dc25c012c93828afc36f1f1cfd8dcb5cdb84c Mon Sep 17 00:00:00 2001 From: Satya Date: Mon, 12 Jul 2021 00:47:48 -0400 Subject: [PATCH 3/8] fixup! Add TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- Examples/tensorflow/BERT/Makefile | 24 +- .../tensorflow/BERT/python.manifest.template | 19 +- Examples/tensorflow/README.md | 247 +++++++++++------- .../ResNet50/python.manifest.template | 20 +- 4 files changed, 180 insertions(+), 130 deletions(-) diff --git a/Examples/tensorflow/BERT/Makefile b/Examples/tensorflow/BERT/Makefile index e7130b1c7c..a50bd4a050 100755 --- a/Examples/tensorflow/BERT/Makefile +++ b/Examples/tensorflow/BERT/Makefile @@ -17,24 +17,29 @@ ifeq ($(SGX),1) all: python.manifest.sgx python.sig python.token endif +BERT_DATASET = https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip +SQUAAD_DATASET = https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json +CHECKPOINTS = https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip +BERT_INT8_MODEL = https://storage.googleapis.com/intel-optimized-tensorflow/models/r2.5-icx-b631821f/asymmetric_per_channel_bert_int8.pb + collateral: apt install unzip test -d models || git clone 
https://github.com/IntelAI/models.git mkdir -p data - test -f data/wwm_uncased_L-24_H-1024_A-16.zip || wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip -P data/ + test -f data/wwm_uncased_L-24_H-1024_A-16.zip || wget $(BERT_DATASET) -P data/ test -d data/wwm_uncased_L-24_H-1024_A-16 || unzip data/wwm_uncased_L-24_H-1024_A-16.zip -d data - test -f data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json || wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -P data/wwm_uncased_L-24_H-1024_A-16 - test -f data/bert_large_checkpoints.zip || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip -P data/ + test -f data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json || wget $(SQUAAD_DATASET) -P data/wwm_uncased_L-24_H-1024_A-16 + test -f data/bert_large_checkpoints.zip || wget $(CHECKPOINTS) -P data/ test -d data/bert_large_checkpoints || unzip data/bert_large_checkpoints.zip -d data - test -f data/asymmetric_per_channel_bert_int8.pb || wget https://storage.googleapis.com/intel-optimized-tensorflow/models/r2.5-icx-b631821f/asymmetric_per_channel_bert_int8.pb -P data/ + test -f data/asymmetric_per_channel_bert_int8.pb || wget $(BERT_INT8_MODEL) -P data/ python.manifest: python.manifest.template collateral graphene-manifest \ - -Dlog_level=$(GRAPHENE_LOG_LEVEL) \ - -Darch_libdir=$(ARCH_LIBDIR) \ - -Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \ - -Dpythondistpath=$(PYTHONDISTPATH) \ - $< >$@ + -Dlog_level=$(GRAPHENE_LOG_LEVEL) \ + -Darch_libdir=$(ARCH_LIBDIR) \ + -Dentrypoint=$(realpath $(shell sh -c "command -v python3")) \ + -Dpythondistpath=$(PYTHONDISTPATH) \ + $< >$@ python.manifest.sgx: python.manifest graphene-sgx-sign \ @@ -53,4 +58,3 @@ clean: .PHONY: distclean distclean: clean $(RM) -r models/ data/ - diff --git a/Examples/tensorflow/BERT/python.manifest.template b/Examples/tensorflow/BERT/python.manifest.template index 9e4316b7cf..fd769cf161 100755 --- 
a/Examples/tensorflow/BERT/python.manifest.template +++ b/Examples/tensorflow/BERT/python.manifest.template @@ -12,12 +12,15 @@ loader.insecure__use_cmdline_argv = 1 # Propagate environment variables from the host. Don't use this on production! loader.insecure__use_host_env = 1 -# Disable address space layour randomization. Don't use this on production! +# Disable address space layout randomization. Don't use this on production! loader.insecure__disable_aslr = 1 # Update Library Path - overwrites environment variable loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" +# Additional memory for Graphene's internal use +loader.pal_internal_mem_size = "512M" + # Default glibc files, mounted from graphene's Runtime directory fs.mount.lib.type = "chroot" fs.mount.lib.path = "/lib" @@ -62,17 +65,15 @@ sgx.nonpie_binary = 1 sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/" sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/" sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/" -sgx.trusted_files.libcpp = "file:/usr/lib/x86_64-linux-gnu/libstdc++.so.6" -sgx.trusted_files.libgcc = "file:/lib/x86_64-linux-gnu/libgcc_s.so.1" +sgx.trusted_files.python = "file:{{ entrypoint }}" +sgx.trusted_files.pyhome = "file:{{ python.stdlib }}" +sgx.trusted_files.pydisthome = "file:{{ python.distlib }}" +sgx.trusted_files.pydistpath = "file:{{ pythondistpath }}" +# SGX allowed files sgx.allowed_files.tmp = "file:/tmp/" sgx.allowed_files.etc = "file:/etc/" -sgx.allow_file_creation = "1" sgx.allowed_files.output = "file:output/" -sgx.allowed_files.scripts = "file:models/models/language_modeling/tensorflow/bert_large/inference/" +sgx.allowed_files.scripts = "file:models/" sgx.allowed_files.dataDir = "file:data/" -sgx.allowed_files.python = "file:{{ entrypoint }}" -sgx.allowed_files.pyhome = "file:{{ python.stdlib }}" -sgx.allowed_files.pydisthome = "file:{{ python.distlib }}" -sgx.allowed_files.pydistpath 
= "file:{{ pythondistpath }}" sgx.allowed_files.keras = "file:root/.keras/keras.json" diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md index 7dd5ed1435..0d269d1d3a 100755 --- a/Examples/tensorflow/README.md +++ b/Examples/tensorflow/README.md @@ -1,125 +1,170 @@ -## Run inference on TensorFlow BERT and ResNet50 models -This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50 sample workloads on Graphene. Specifically, both these examples use pre-trained models to run inference. We tested this on Ubuntu 18.04 and uses the package version of Python 3.6. +## Inference on TensorFlow BERT and ResNet50 models +This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50\ +sample workloads on Graphene. Specifically, both these examples use pre-trained models to run inference.\ +We tested this on Ubuntu 18.04 and uses the package version with Python 3.6. + +### Bidirectional Encoder Representations from Transformers (BERT): +BERT is a method of pre-training language representations and then use that trained model for downstream\ +NLP tasks like 'question answering'. BERT is an unsupervised, deeply birectional system for pre-training NLP.\ +In this BERT sample, we use 'BERT-Large, Uncased (Whole Word Masking)' model and perform int8 inference.\ +More details about BERT can be found at https://github.com/google-research/bert. + +### Residual Network (ResNet): +ResNet50 is a convolutional neural network that is 50 layers deep.\ +In this ResNet50(v1.5) sample, we use a pre-trained model and perform int8 inference.\ +More details about ResNet50 can be found at https://github.com/IntelAI/models/tree/icx-launch-public/benchmarks/image_recognition/tensorflow/resnet50v1_5. + ## Pre-System setting -Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency to achieve best performance or to save power based on the requirement. 
To achieve the best peformance, please set the CPU frequency scaling governor to performance mode. +Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency\ +to achieve best performance or to save power based on the requirement. +To achieve the best peformance, please set the CPU frequency scaling governor to performance mode. ``for ((i=0; i<$(nproc); i++)); do echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor; done`` ## Pre-requisites - Install python3.6. - Upgrade pip/pip3. -- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. +- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl\ +package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. ## Build BERT or ResNet50 samples - To build BERT sample, do ``cd BERT`` or to build ResNet50 sample, do ``cd ResNet50``. - To clean the sample, do ``make clean`` -- To clean and remove downloaded models and datasets, do ``make distclean`` +- To clean and remove downloaded models and datasets, do ``make distclean`` - To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/`` - To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1`` ->**NOTE** Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages', but can change based on python's installation directory. +>**WARNING:** Building BERT sample downloads about 5GB of data.\ +>**NOTE:** Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages',\ +but can change based on python's installation directory. ## Run inference on BERT model -- To run int8 inference on graphene-sgx(SGX version)
-``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36`` -- To run int8 inference on graphene-direct(non-SGX version)
-``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36`` -- To run int8 inference on native baremetal(outside graphene)
-``KMP_BLOCKTIME=1 KMP_SETTINGS=1 OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --output_dir=output/bert-squad-output --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36`` +- To run int8 inference on graphene-sgx(SGX version) +``` +OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \ +./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ +--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \ +--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \ +--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \ +--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \ +--precision=int8 \ +--output_dir=output/bert-squad-output \ +--predict_batch_size=32 \ +--experimental_gelu=True \ +--optimized_softmax=True \ +--input_graph=data/asymmetric_per_channel_bert_int8.pb \ +--do_predict=True --mode=benchmark \ +--inter_op_parallelism_threads=1 \ +--intra_op_parallelism_threads=36 +``` +- To run int8 inference on graphene-direct(non-SGX version) +``` +OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 \ +graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ +--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \ +--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \ 
+--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \ +--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \ +--precision=int8 \ +--output_dir=output/bert-squad-output \ +--predict_batch_size=32 \ +--experimental_gelu=True \ +--optimized_softmax=True \ +--input_graph=data/asymmetric_per_channel_bert_int8.pb \ +--do_predict=True \ +--mode=benchmark \ +--inter_op_parallelism_threads=1 \ +--intra_op_parallelism_threads=36 +``` +- To run int8 inference on native baremetal(outside graphene) +``` +OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \ +models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ +--init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \ +--vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \ +--bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \ +--predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \ +--precision=int8 --output_dir=output/bert-squad-output \ +--predict_batch_size=32 \ +--experimental_gelu=True \ +--optimized_softmax=True \ +--input_graph=data/asymmetric_per_channel_bert_int8.pb \ +--do_predict=True \ +--mode=benchmark \ +--inter_op_parallelism_threads=1 \ +--intra_op_parallelism_threads=36 +``` - Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. 
- - OMP_NUM_THREADS='Core(s) per socket' - - taskset to 'Core(s) per socket' - - intra_op_parallelism_threads='Core(s) per socket' ->**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + - OMP_NUM_THREADS='Core(s) per socket' + - taskset to 'Core(s) per socket' + - intra_op_parallelism_threads='Core(s) per socket' + - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` + - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact`` +>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\ +> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions. \ +> KMP_AFFINITY binds OpenMP threads to physical processing units. ## Run inference on ResNet50 model -- To run inference on graphene-sgx(SGX version)
-``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=512 --warmup-steps=50 --steps=500`` -- To run inference on graphene-direct(non-SGX version)
-``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=512 --warmup-steps=50 --steps=500`` -- To run inference on native baremetal(outside graphene)
-``OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500`` +- To run inference on graphene-sgx(SGX version) +``` +OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \ +./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ +--input-graph=resnet50v1_5_int8_pretrained_model.pb \ +--num-inter-threads=1 \ +--num-intra-threads=36 \ +--batch-size=32 \ +--warmup-steps=50 \ +--steps=500 +``` +- To run inference on graphene-direct(non-SGX version) +``` +OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct \ +./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ +--input-graph=resnet50v1_5_int8_pretrained_model.pb \ +--num-inter-threads=1 \ +--num-intra-threads=36 \ +--batch-size=32 \ +--warmup-steps=50 \ +--steps=500 +``` +- To run inference on native baremetal(outside graphene) +``` +OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \ +models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ +--input-graph=resnet50v1_5_int8_pretrained_model.pb \ +--num-inter-threads=1 \ +--num-intra-threads=36 \ +--batch-size=32 \ +--warmup-steps=50 \ +--steps=500 +``` - Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. 
- - OMP_NUM_THREADS='Core(s) per socket' - - taskset to 'Core(s) per socket' - - num-intra-threads='Core(s) per socket' ->**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` - -# GSC : - -## Build graphenize Docker image and run BERT inference : -1. ``cd $(GRAPHENE_DIR)/Tools/gsc`` - -2. Create a configuration file : ``cp config.yaml.template config.yaml`` -Manually adopt config.yaml to the installed Intel SGX driver and desired Graphene repository/version - -3. Generate the signing key : ``openssl genrsa -3 -out enclave-key.pem 3072`` - -4. Build docker image : - - ``cd test`` - - ``docker build --rm -t ubuntu18.04-tensorflow-bert -f ubuntu18.04-tensorflow-bert.dockerfile ../../../Examples`` - -5. Graphenize the docker image using gsc build : - - ``cd ..`` - - ``./gsc build --insecure-args ubuntu18.04-tensorflow-bert test/ubuntu18.04-tensorflow.manifest`` - -6. Sign the graphenized Docker image using gsc sign-image : ``./gsc sign-image ubuntu18.04-tensorflow-bert enclave-key.pem`` - -7. To run int8 inference on GSC
-``docker run --device=/dev/sgx_encalve --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_BLOCKTIME=1 --env KMP_SETTINGS=1 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 gsc-ubuntu18.04-tensorflow-bert models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36 --output_dir=output/bert-squad-output`` - -8. To run int8 inference on native container
-``docker run --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_BLOCKTIME=1 --env KMP_SETTINGS=1 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 ubuntu18.04-tensorflow-bert models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py --init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 --vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt --bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json --predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json --precision=int8 --predict_batch_size=32 --experimental_gelu=True --optimized_softmax=True --input_graph=data/asymmetric_per_channel_bert_int8.pb --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=36 --output_dir=output/bert-squad-output`` - -9. Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. - - OMP_NUM_THREADS='Core(s) per socket' - - --cpuset-cpus to 'Core(s) per socket' - - intra_op_parallelism_threads='Core(s) per socket' ->**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` - -## Build graphenize Docker image and run ResNet50 inference : -1. ``cd $(GRAPHENE_DIR)/Tools/gsc`` - -2. Create a configuration file : ``cp config.yaml.template config.yaml`` -Manually adopt config.yaml to the installed Intel SGX driver and desired Graphene repository/version - -3. Generate the signing key : ``openssl genrsa -3 -out enclave-key.pem 3072`` - -4. Build docker image : - - ``cd test`` - - ``docker build --rm -t ubuntu18.04-tensorflow-resnet50 -f ubuntu18.04-tensorflow-resnet50.dockerfile ../../../Examples`` - -5. Graphenize the docker image using gsc build : - - ``cd ..`` - - ``./gsc build --insecure-args ubuntu18.04-tensorflow-resnet50 test/ubuntu18.04-tensorflow.manifest`` - -6. Sign the graphenized Docker image using gsc sign-image : ``./gsc sign-image ubuntu18.04-tensorflow-resnet50 enclave-key.pem`` - -7. 
To run inference on GSC
-``docker run --device=/dev/sgx_enclave --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 gsc-ubuntu18.04-tensorflow-resnet50 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500`` - > **NOTE**: When OOM happens pass option ``-env TF_MKL_ALLOC_MAX_BYTES=34359738368`` to docker run command. -8. To run inference on native Container
-``docker run --cpuset-cpus="0-35" --env OMP_NUM_THREADS=36 --env KMP_AFFINITY=granularity=fine,noverbose,compact,1,0 ubuntu18.04-tensorflow-resnet50 models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=36 --batch-size=128 --warmup-steps=50 --steps=500`` - -9. Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. - - OMP_NUM_THREADS='Core(s) per socket' - - --cpuset-cpus to 'Core(s) per socket' - - num-intra-threads='Core(s) per socket' ->**NOTE** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + - OMP_NUM_THREADS='Core(s) per socket' + - taskset to 'Core(s) per socket' + - num-intra-threads='Core(s) per socket' + - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` + - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact`` + - The options batch-size, warmup-steps and steps can be varied. +>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\ +> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions.\ +> KMP_AFFINITY binds OpenMP threads to physical processing units. ## Performance considerations -- Preheat manifest option pre-faults the enclave memory and moves the performance penalty to graphene-sgx invocation (before the workload starts executing). To use preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template. -- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help improve performance significantly based on the workloads. At any point, only one of these allocators can be used. 
+- Preheat manifest option pre-faults the enclave memory and moves the performance penalty to\ +graphene-sgx invocation (before the workload starts execution).\ +To use preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template. +- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help\ +improve performance significantly based on the workloads. At any point, only one of these allocators can be used. - TCMalloc (Please update the binary location and name if different from default) - - Install tcmalloc : sudo apt-get install google-perftools - - Add these in the manifest template
- ``loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``
- ``sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``
- ``sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"`` - - Save the template and rebuild. + - Install tcmalloc : ``sudo apt-get install google-perftools`` + - Add these in the manifest template + ```loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``` + ```sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``` + ```sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"``` + - Save the template and rebuild. - mimalloc (Please update the binary location and name if different from default) - - Install mimalloc using the steps from https://github.com/microsoft/mimalloc - - Add these in the manifest template
- ``loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``
- ``sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"`` - - Save the template and rebuild. + - Install mimalloc using the steps from https://github.com/microsoft/mimalloc + - Add these in the manifest template + ```loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``` + ```sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``` + - Save the template and rebuild. diff --git a/Examples/tensorflow/ResNet50/python.manifest.template b/Examples/tensorflow/ResNet50/python.manifest.template index 74815e9c81..857023154b 100755 --- a/Examples/tensorflow/ResNet50/python.manifest.template +++ b/Examples/tensorflow/ResNet50/python.manifest.template @@ -12,12 +12,15 @@ loader.insecure__use_cmdline_argv = 1 # Propagate environment variables from the host. Don't use this on production! loader.insecure__use_host_env = 1 -# Disable address space layour randomization. Don't use this on production! +# Disable address space layout randomization. Don't use this on production! 
loader.insecure__disable_aslr = 1 # Update Library Path - overwrites environment variable loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" +# Additional memory for Graphene's internal use +loader.pal_internal_mem_size = "512M" + # Default glibc files, mounted from graphene's Runtime directory fs.mount.lib.type = "chroot" fs.mount.lib.path = "/lib" @@ -66,18 +69,15 @@ sgx.nonpie_binary = 1 sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/" sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/" sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/" -sgx.trusted_files.libcpp = "file:/usr/lib/x86_64-linux-gnu/libstdc++.so.6" -sgx.trusted_files.libgcc = "file:/lib/x86_64-linux-gnu/libgcc_s.so.1" sgx.trusted_files.model = "file:resnet50v1_5_int8_pretrained_model.pb" +sgx.trusted_files.python = "file:{{ entrypoint }}" +sgx.trusted_files.pyhome = "file:{{ python.stdlib }}" +sgx.trusted_files.pydisthome = "file:{{ python.distlib }}" +sgx.trusted_files.pydistpath = "file:{{ pythondistpath }}" +# SGX allowed files sgx.allowed_files.tmp = "file:/tmp/" sgx.allowed_files.etc = "file:/etc/" -sgx.allow_file_creation = "1" sgx.allowed_files.proc = "file:/proc/" -sgx.allowed_files.cpuinfo = "file:/proc/cpuinfo/" -sgx.allowed_files.scripts = "file:models/models/image_recognition/tensorflow/resnet50v1_5/inference/" -sgx.allowed_files.python = "file:{{ entrypoint }}" -sgx.allowed_files.pyhome = "file:{{ python.stdlib }}" -sgx.allowed_files.pydisthome = "file:{{ python.distlib }}" -sgx.allowed_files.pydistpath = "file:{{ pythondistpath }}" +sgx.allowed_files.scripts = "file:models/" sgx.allowed_files.keras = "file:root/.keras/keras.json" From b8c1bb8bc267b8318d71faacae8e09bbdbacd82a Mon Sep 17 00:00:00 2001 From: Satya Date: Mon, 19 Jul 2021 22:59:31 -0400 Subject: [PATCH 4/8] fixup! fixup! 
Add TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- Examples/tensorflow/README.md | 81 +++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md index 0d269d1d3a..62ee523f6d 100755 --- a/Examples/tensorflow/README.md +++ b/Examples/tensorflow/README.md @@ -1,22 +1,22 @@ ## Inference on TensorFlow BERT and ResNet50 models -This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50\ -sample workloads on Graphene. Specifically, both these examples use pre-trained models to run inference.\ -We tested this on Ubuntu 18.04 and uses the package version with Python 3.6. +This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50 +sample workloads on Graphene. Specifically, both these examples use pre-trained models to run +inference. We tested this on Ubuntu 18.04 and uses the package version with Python 3.6. ### Bidirectional Encoder Representations from Transformers (BERT): -BERT is a method of pre-training language representations and then use that trained model for downstream\ -NLP tasks like 'question answering'. BERT is an unsupervised, deeply birectional system for pre-training NLP.\ -In this BERT sample, we use 'BERT-Large, Uncased (Whole Word Masking)' model and perform int8 inference.\ -More details about BERT can be found at https://github.com/google-research/bert. +BERT is a method of pre-training language representations and then use that trained model for +downstream NLP tasks like 'question answering'. BERT is an unsupervised, deeply bidirectional system +for pre-training NLP. +In this BERT sample, we use 'BERT-Large, Uncased (Whole Word Masking)' model and perform int8 +inference. More details about BERT can be found at https://github.com/google-research/bert.
### Residual Network (ResNet): -ResNet50 is a convolutional neural network that is 50 layers deep.\ -In this ResNet50(v1.5) sample, we use a pre-trained model and perform int8 inference.\ +ResNet50 is a convolutional neural network that is 50 layers deep. +In this ResNet50(v1.5) sample, we use a pre-trained model and perform int8 inference. More details about ResNet50 can be found at https://github.com/IntelAI/models/tree/icx-launch-public/benchmarks/image_recognition/tensorflow/resnet50v1_5. - ## Pre-System setting -Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency\ +Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency to achieve best performance or to save power based on the requirement. To achieve the best peformance, please set the CPU frequency scaling governor to performance mode. @@ -25,7 +25,7 @@ To achieve the best peformance, please set the CPU frequency scaling governor to ## Pre-requisites - Install python3.6. - Upgrade pip/pip3. -- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl\ +- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. ## Build BERT or ResNet50 samples @@ -34,9 +34,9 @@ package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. - To clean and remove downloaded models and datasets, do ``make distclean`` - To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/`` - To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1`` ->**WARNING:** Building BERT sample downloads about 5GB of data.\ ->**NOTE:** Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages',\ -but can change based on python's installation directory. 
+- Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages', but can +change based on python's installation directory. +>**WARNING:** Building BERT sample downloads about 5GB of data. ## Run inference on BERT model - To run int8 inference on graphene-sgx(SGX version) @@ -94,15 +94,16 @@ models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ --inter_op_parallelism_threads=1 \ --intra_op_parallelism_threads=36 ``` -- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. - - OMP_NUM_THREADS='Core(s) per socket' +- Above commands are for a 36 core system. Please set the following options accordingly for optimal + performance. + - OMP_NUM_THREADS='Core(s) per socket', OMP_NUM_THREADS sets the maximum number of threads to + use for OpenMP parallel regions. - taskset to 'Core(s) per socket' - intra_op_parallelism_threads='Core(s) per socket' - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact`` ->**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\ -> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions. \ -> KMP_AFFINITY binds OpenMP threads to physical processing units. + - KMP_AFFINITY binds OpenMP threads to physical processing units. +>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` ## Run inference on ResNet50 model - To run inference on graphene-sgx(SGX version) @@ -138,33 +139,37 @@ models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_cla --warmup-steps=50 \ --steps=500 ``` -- Above commands are for a 36 core system. Please set the following options accordingly for optimal performance. - - OMP_NUM_THREADS='Core(s) per socket' +- Above commands are for a 36 core system. 
Please set the following options accordingly for optimal + performance. + - OMP_NUM_THREADS='Core(s) per socket', OMP_NUM_THREADS sets the maximum number of threads to + use for OpenMP parallel regions. - taskset to 'Core(s) per socket' - num-intra-threads='Core(s) per socket' - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact`` + - KMP_AFFINITY binds OpenMP threads to physical processing units. - The options batch-size, warmup-steps and steps can be varied. ->**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'``\ -> OMP_NUM_THREADS sets the maximum number of threads to use for OpenMP parallel regions.\ -> KMP_AFFINITY binds OpenMP threads to physical processing units. +>**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` ## Performance considerations -- Preheat manifest option pre-faults the enclave memory and moves the performance penalty to\ -graphene-sgx invocation (before the workload starts execution).\ +- Preheat manifest option pre-faults the enclave memory and moves the performance penalty to +graphene-sgx invocation (before the workload starts execution). To use preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template. -- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help\ -improve performance significantly based on the workloads. At any point, only one of these allocators can be used. +- TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help + improve performance significantly based on the workloads. At any point, only one of these + allocators can be used. 
- TCMalloc (Please update the binary location and name if different from default) - Install tcmalloc : ``sudo apt-get install google-perftools`` - - Add these in the manifest template - ```loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``` - ```sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"``` - ```sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"``` - - Save the template and rebuild. + - Add the following lines in the manifest template and rebuild the sample. +``` +loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" +sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" +sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8" +``` - mimalloc (Please update the binary location and name if different from default) - Install mimalloc using the steps from https://github.com/microsoft/mimalloc - - Add these in the manifest template - ```loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``` - ```sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"``` - - Save the template and rebuild. + - Add the following lines in the manifest template and rebuild the sample. +``` +loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7" +sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7" +``` From b891ded309b7ef09ccf5e329b99d708b65542c4e Mon Sep 17 00:00:00 2001 From: Satya Date: Fri, 23 Jul 2021 00:49:17 -0400 Subject: [PATCH 5/8] fixup! fixup! fixup! 
Add TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- .../tensorflow/BERT/python.manifest.template | 12 +-- Examples/tensorflow/README.md | 91 ++++++++++--------- .../ResNet50/python.manifest.template | 12 +-- 3 files changed, 56 insertions(+), 59 deletions(-) diff --git a/Examples/tensorflow/BERT/python.manifest.template b/Examples/tensorflow/BERT/python.manifest.template index fd769cf161..350de0079a 100755 --- a/Examples/tensorflow/BERT/python.manifest.template +++ b/Examples/tensorflow/BERT/python.manifest.template @@ -1,5 +1,3 @@ -# This manifest was tested on Ubuntu 18.04 with python3.6. - libos.entrypoint = "{{ entrypoint }}" loader.preload = "file:{{ graphene.libos }}" @@ -7,13 +5,13 @@ loader.preload = "file:{{ graphene.libos }}" loader.log_level = "{{ log_level }}" # Read application arguments directly from the command line. Don't use this on production! -loader.insecure__use_cmdline_argv = 1 +loader.insecure__use_cmdline_argv = true # Propagate environment variables from the host. Don't use this on production! -loader.insecure__use_host_env = 1 +loader.insecure__use_host_env = true # Disable address space layout randomization. Don't use this on production! 
-loader.insecure__disable_aslr = 1 +loader.insecure__disable_aslr = true # Update Library Path - overwrites environment variable loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" @@ -58,8 +56,8 @@ fs.mount.etc.uri = "file:/etc" # SGX general options sgx.enclave_size = "32G" sgx.thread_num = 256 -sgx.preheat_enclave = 1 -sgx.nonpie_binary = 1 +sgx.preheat_enclave = true +sgx.nonpie_binary = true # SGX trusted files sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/" diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md index 62ee523f6d..93ea39a508 100755 --- a/Examples/tensorflow/README.md +++ b/Examples/tensorflow/README.md @@ -1,7 +1,7 @@ ## Inference on TensorFlow BERT and ResNet50 models This directory contains steps and artifacts to run inference with TensorFlow BERT and ResNet50 sample workloads on Graphene. Specifically, both these examples use pre-trained models to run -inference. We tested this on Ubuntu 18.04 and uses the package version with Python 3.6. +inference. ### Bidirectional Encoder Representations from Transformers (BERT): BERT is a method of pre-training language representations and then use that trained model for @@ -15,15 +15,7 @@ ResNet50 is a convolutional neural network that is 50 layers deep. In this ResNet50(v1.5) sample, we use a pre-trained model and perform int8 inference. More details about ResNet50 can be found at https://github.com/IntelAI/models/tree/icx-launch-public/benchmarks/image_recognition/tensorflow/resnet50v1_5. -## Pre-System setting -Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency -to achieve best performance or to save power based on the requirement. -To achieve the best peformance, please set the CPU frequency scaling governor to performance mode. 
- -``for ((i=0; i<$(nproc); i++)); do echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor; done`` - ## Pre-requisites -- Install python3.6. - Upgrade pip/pip3. - Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. @@ -36,10 +28,13 @@ package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. - To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1`` - Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages', but can change based on python's installation directory. ->**WARNING:** Building BERT sample downloads about 5GB of data. +- Keras settings are configured in the file root/.keras/keras.json. It is configured to use +tensorflow as backend. + +**WARNING:** Building BERT sample downloads about 5GB of data. ## Run inference on BERT model -- To run int8 inference on graphene-sgx(SGX version) +- To run int8 inference on graphene-sgx (SGX version) ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \ ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ @@ -57,7 +52,7 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c --inter_op_parallelism_threads=1 \ --intra_op_parallelism_threads=36 ``` -- To run int8 inference on graphene-direct(non-SGX version) +- To run int8 inference on graphene-direct (non-SGX version) ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 \ graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ @@ -76,7 +71,7 @@ graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/i --inter_op_parallelism_threads=1 \ --intra_op_parallelism_threads=36 ``` -- To run int8 inference on native baremetal(outside graphene) +- To 
run int8 inference on native baremetal (outside Graphene) ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \ models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ @@ -95,18 +90,20 @@ models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ --intra_op_parallelism_threads=36 ``` - Above commands are for a 36 core system. Please set the following options accordingly for optimal - performance. - - OMP_NUM_THREADS='Core(s) per socket', OMP_NUM_THREADS sets the maximum number of threads to - use for OpenMP parallel regions. - - taskset to 'Core(s) per socket' - - intra_op_parallelism_threads='Core(s) per socket' - - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` - - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact`` - - KMP_AFFINITY binds OpenMP threads to physical processing units. ->**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + performance: + - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X` + and `intra_op_parallelism_threads=X`. + - Specify the whole range of cores available on one of the sockets in `taskset`. + - If hyperthreading is enabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` + - If hyperthreading is disabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact`` + - Note that `OMP_NUM_THREADS` sets the maximum number of threads to + use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads + to physical processing units. + +**NOTE:** To get number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``. 
## Run inference on ResNet50 model -- To run inference on graphene-sgx(SGX version) +- To run inference on graphene-sgx (SGX version) ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \ ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ @@ -117,7 +114,7 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c --warmup-steps=50 \ --steps=500 ``` -- To run inference on graphene-direct(non-SGX version) +- To run inference on graphene-direct (non-SGX version) ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct \ ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ @@ -128,7 +125,7 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c --warmup-steps=50 \ --steps=500 ``` -- To run inference on native baremetal(outside graphene) +- To run inference on native baremetal (outside Graphene) ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \ models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ @@ -140,36 +137,40 @@ models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_cla --steps=500 ``` - Above commands are for a 36 core system. Please set the following options accordingly for optimal - performance. - - OMP_NUM_THREADS='Core(s) per socket', OMP_NUM_THREADS sets the maximum number of threads to - use for OpenMP parallel regions. - - taskset to 'Core(s) per socket' - - num-intra-threads='Core(s) per socket' - - If hyperthreading is enabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` - - If hyperthreading is disabled : use ``KMP_AFFINITY=granularity=fine,verbose,compact`` - - KMP_AFFINITY binds OpenMP threads to physical processing units. 
+ performance: + - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X` + and `num_intra_threads=X`. + - Specify the whole range of cores available on one of the sockets in `taskset`. + - If hyperthreading is enabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` + - If hyperthreading is disabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact`` + - Note that `OMP_NUM_THREADS` sets the maximum number of threads to + use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads + to physical processing units. - The options batch-size, warmup-steps and steps can be varied. ->**NOTE:** To get 'Core(s) per socket', do ``lscpu | grep 'Core(s) per socket'`` + +**NOTE:** To get number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``. ## Performance considerations +- Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency +to achieve best performance or to save power based on the requirement. +To set the CPU frequency scaling governor to performance mode: + + - ``for ((i=0; i<$(nproc); i++)); do echo 'performance' > /sys/devices/system/cpu/cpu$i/cpufreq/scaling_governor; done`` + - Preheat manifest option pre-faults the enclave memory and moves the performance penalty to graphene-sgx invocation (before the workload starts execution). -To use preheat option, add ``sgx.preheat_enclave = 1`` to the manifest template. +To use preheat option, add ``sgx.preheat_enclave = true`` to the manifest template. - TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help improve performance significantly based on the workloads. At any point, only one of these allocators can be used. 
- TCMalloc (Please update the binary location and name if different from default) - - Install tcmalloc : ``sudo apt-get install google-perftools`` + - Install tcmalloc: ``sudo apt-get install google-perftools`` - Add the following lines in the manifest template and rebuild the sample. -``` -loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" -sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" -sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8" -``` + - ``loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"`` + - ``sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"`` + - ``sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"`` - mimalloc (Please update the binary location and name if different from default) - Install mimalloc using the steps from https://github.com/microsoft/mimalloc - Add the following lines in the manifest template and rebuild the sample. -``` -loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7" -sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7" -``` + - ``loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"`` + - ``sgx.trusted_files.libmimalloc = "file:/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"`` diff --git a/Examples/tensorflow/ResNet50/python.manifest.template b/Examples/tensorflow/ResNet50/python.manifest.template index 857023154b..d5502f2045 100755 --- a/Examples/tensorflow/ResNet50/python.manifest.template +++ b/Examples/tensorflow/ResNet50/python.manifest.template @@ -1,5 +1,3 @@ -# This manifest was tested on Ubuntu 18.04 with python3.6. - libos.entrypoint = "{{ entrypoint }}" loader.preload = "file:{{ graphene.libos }}" @@ -7,13 +5,13 @@ loader.preload = "file:{{ graphene.libos }}" loader.log_level = "{{ log_level }}" # Read application arguments directly from the command line. Don't use this on production! 
-loader.insecure__use_cmdline_argv = 1 +loader.insecure__use_cmdline_argv = true # Propagate environment variables from the host. Don't use this on production! -loader.insecure__use_host_env = 1 +loader.insecure__use_host_env = true # Disable address space layout randomization. Don't use this on production! -loader.insecure__disable_aslr = 1 +loader.insecure__disable_aslr = true # Update Library Path - overwrites environment variable loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" @@ -62,8 +60,8 @@ fs.mount.etc.uri = "file:/etc" # SGX general options sgx.enclave_size = "32G" sgx.thread_num = 300 -sgx.preheat_enclave = 1 -sgx.nonpie_binary = 1 +sgx.preheat_enclave = true +sgx.nonpie_binary = true # SGX trusted files sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/" From e4488412e107e0db6c8472cc6a464ef30884d8e2 Mon Sep 17 00:00:00 2001 From: Satya Date: Mon, 26 Jul 2021 03:00:57 -0400 Subject: [PATCH 6/8] fixup! fixup! fixup! fixup! 
Add TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- Examples/tensorflow/BERT/.gitignore | 3 ++ Examples/tensorflow/README.md | 71 ++++--------------------- Examples/tensorflow/ResNet50/.gitignore | 2 + 3 files changed, 16 insertions(+), 60 deletions(-) create mode 100644 Examples/tensorflow/BERT/.gitignore create mode 100644 Examples/tensorflow/ResNet50/.gitignore diff --git a/Examples/tensorflow/BERT/.gitignore b/Examples/tensorflow/BERT/.gitignore new file mode 100644 index 0000000000..ca0b8b8d11 --- /dev/null +++ b/Examples/tensorflow/BERT/.gitignore @@ -0,0 +1,3 @@ +/models/ +/data/ +/output/ diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md index 93ea39a508..5112301741 100755 --- a/Examples/tensorflow/README.md +++ b/Examples/tensorflow/README.md @@ -52,43 +52,11 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c --inter_op_parallelism_threads=1 \ --intra_op_parallelism_threads=36 ``` -- To run int8 inference on graphene-direct (non-SGX version) -``` -OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 \ -graphene-direct ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ ---init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \ ---vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \ ---bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \ ---predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \ ---precision=int8 \ ---output_dir=output/bert-squad-output \ ---predict_batch_size=32 \ ---experimental_gelu=True \ ---optimized_softmax=True \ ---input_graph=data/asymmetric_per_channel_bert_int8.pb \ ---do_predict=True \ ---mode=benchmark \ ---inter_op_parallelism_threads=1 \ ---intra_op_parallelism_threads=36 -``` -- To run int8 inference on native baremetal (outside Graphene) -``` -OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \ 
-models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ ---init_checkpoint=data/bert_large_checkpoints/model.ckpt-3649 \ ---vocab_file=data/wwm_uncased_L-24_H-1024_A-16/vocab.txt \ ---bert_config_file=data/wwm_uncased_L-24_H-1024_A-16/bert_config.json \ ---predict_file=data/wwm_uncased_L-24_H-1024_A-16/dev-v1.1.json \ ---precision=int8 --output_dir=output/bert-squad-output \ ---predict_batch_size=32 \ ---experimental_gelu=True \ ---optimized_softmax=True \ ---input_graph=data/asymmetric_per_channel_bert_int8.pb \ ---do_predict=True \ ---mode=benchmark \ ---inter_op_parallelism_threads=1 \ ---intra_op_parallelism_threads=36 -``` +- To run int8 inference on graphene-direct (non-SGX version), replace `graphene-sgx` with +`graphene-direct` in the above command. +- To run int8 inference on native baremetal (outside Graphene), replace `graphene-sgx ./python` with +`python3` in the above command. + - Above commands are for a 36 core system. Please set the following options accordingly for optimal performance: - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X` @@ -114,28 +82,11 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c --warmup-steps=50 \ --steps=500 ``` -- To run inference on graphene-direct (non-SGX version) -``` -OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-direct \ -./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ ---input-graph=resnet50v1_5_int8_pretrained_model.pb \ ---num-inter-threads=1 \ ---num-intra-threads=36 \ ---batch-size=32 \ ---warmup-steps=50 \ ---steps=500 -``` -- To run inference on native baremetal (outside Graphene) -``` -OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 python3.6 \ -models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ 
---input-graph=resnet50v1_5_int8_pretrained_model.pb \ ---num-inter-threads=1 \ ---num-intra-threads=36 \ ---batch-size=32 \ ---warmup-steps=50 \ ---steps=500 -``` +- To run inference on graphene-direct (non-SGX version), replace `graphene-sgx` with +`graphene-direct` in the above command. +- To run inference on native baremetal (outside Graphene), replace `graphene-sgx ./python` with +`python3` in the above command. + - Above commands are for a 36 core system. Please set the following options accordingly for optimal performance: - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X` @@ -146,7 +97,7 @@ models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_cla - Note that `OMP_NUM_THREADS` sets the maximum number of threads to use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads to physical processing units. - - The options batch-size, warmup-steps and steps can be varied. + - The options `batch-size`, `warmup-steps` and `steps` can be varied. **NOTE:** To get number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``. diff --git a/Examples/tensorflow/ResNet50/.gitignore b/Examples/tensorflow/ResNet50/.gitignore new file mode 100644 index 0000000000..e3e7bc828d --- /dev/null +++ b/Examples/tensorflow/ResNet50/.gitignore @@ -0,0 +1,2 @@ +/models/ +/resnet50v1_5_int8_pretrained_model.pb From a4ea7d9851ace8bc280f1e4cd45a4af38e36dc49 Mon Sep 17 00:00:00 2001 From: Satya Date: Wed, 4 Aug 2021 23:15:30 +0530 Subject: [PATCH 7/8] fixup! fixup! fixup! fixup! fixup! 
Add TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- Examples/tensorflow/BERT/Makefile | 8 +-- .../tensorflow/BERT/python.manifest.template | 13 ---- Examples/tensorflow/README.md | 64 ++++++++----------- Examples/tensorflow/ResNet50/Makefile | 8 +-- .../ResNet50/python.manifest.template | 13 ---- 5 files changed, 33 insertions(+), 73 deletions(-) diff --git a/Examples/tensorflow/BERT/Makefile b/Examples/tensorflow/BERT/Makefile index a50bd4a050..f2a029652e 100755 --- a/Examples/tensorflow/BERT/Makefile +++ b/Examples/tensorflow/BERT/Makefile @@ -1,9 +1,7 @@ # BERT sample for Tensorflow -GRAPHENEDIR ?= ../../.. -SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem - -include $(GRAPHENEDIR)/Scripts/Makefile.configs +ARCH_LIBDIR ?= /lib/$(shell $(CC) -dumpmachine) +SGX_SIGNER_KEY ?= ../../../Pal/src/host/Linux-SGX/signer/enclave-key.pem ifeq ($(DEBUG),1) GRAPHENE_LOG_LEVEL = debug @@ -42,6 +40,8 @@ python.manifest: python.manifest.template collateral $< >$@ python.manifest.sgx: python.manifest + @test -s $(SGX_SIGNER_KEY) || \ + { echo "SGX signer private key was not found, please specify SGX_SIGNER_KEY!"; exit 1; } graphene-sgx-sign \ --key $(SGX_SIGNER_KEY) \ --manifest $< -output $@ diff --git a/Examples/tensorflow/BERT/python.manifest.template b/Examples/tensorflow/BERT/python.manifest.template index 350de0079a..a679942efd 100755 --- a/Examples/tensorflow/BERT/python.manifest.template +++ b/Examples/tensorflow/BERT/python.manifest.template @@ -1,30 +1,20 @@ libos.entrypoint = "{{ entrypoint }}" loader.preload = "file:{{ graphene.libos }}" -# Graphene log level loader.log_level = "{{ log_level }}" -# Read application arguments directly from the command line. Don't use this on production! loader.insecure__use_cmdline_argv = true - -# Propagate environment variables from the host. Don't use this on production! loader.insecure__use_host_env = true - -# Disable address space layout randomization. 
Don't use this on production! loader.insecure__disable_aslr = true -# Update Library Path - overwrites environment variable loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" -# Additional memory for Graphene's internal use loader.pal_internal_mem_size = "512M" -# Default glibc files, mounted from graphene's Runtime directory fs.mount.lib.type = "chroot" fs.mount.lib.path = "/lib" fs.mount.lib.uri = "file:{{ graphene.runtimedir() }}" -# More libraries required by Tensorflow fs.mount.lib2.type = "chroot" fs.mount.lib2.path = "{{ arch_libdir }}" fs.mount.lib2.uri = "file:{{ arch_libdir }}" @@ -53,13 +43,11 @@ fs.mount.etc.type = "chroot" fs.mount.etc.path = "/etc" fs.mount.etc.uri = "file:/etc" -# SGX general options sgx.enclave_size = "32G" sgx.thread_num = 256 sgx.preheat_enclave = true sgx.nonpie_binary = true -# SGX trusted files sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/" sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/" sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/" @@ -68,7 +56,6 @@ sgx.trusted_files.pyhome = "file:{{ python.stdlib }}" sgx.trusted_files.pydisthome = "file:{{ python.distlib }}" sgx.trusted_files.pydistpath = "file:{{ pythondistpath }}" -# SGX allowed files sgx.allowed_files.tmp = "file:/tmp/" sgx.allowed_files.etc = "file:/etc/" sgx.allowed_files.output = "file:output/" diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md index 5112301741..d0fae9d22c 100755 --- a/Examples/tensorflow/README.md +++ b/Examples/tensorflow/README.md @@ -5,36 +5,35 @@ inference. ### Bidirectional Encoder Representations from Transformers (BERT): BERT is a method of pre-training language representations and then use that trained model for -downstream NLP tasks like 'question answering'. BERT is an unsupervised, deeply birectional system +downstream NLP tasks like 'question answering'. 
BERT is an unsupervised, deeply bidirectional system for pre-training NLP. -In this BERT sample, we use 'BERT-Large, Uncased (Whole Word Masking)' model and perform int8 +In this BERT sample, we use **BERT-Large, Uncased (Whole Word Masking)** model and perform int8 inference. More details about BERT can be found at https://github.com/google-research/bert. ### Residual Network (ResNet): ResNet50 is a convolutional neural network that is 50 layers deep. -In this ResNet50(v1.5) sample, we use a pre-trained model and perform int8 inference. +In this ResNet50 (v1.5) sample, we use a pre-trained model and perform int8 inference. More details about ResNet50 can be found at https://github.com/IntelAI/models/tree/icx-launch-public/benchmarks/image_recognition/tensorflow/resnet50v1_5. ## Pre-requisites - Upgrade pip/pip3. -- Install tensorflow using ``pip install intel-tensorflow-avx512==2.4.0`` or by downloading whl -package from https://pypi.org/project/intel-tensorflow-avx512/2.4.0/#files. +- Install TensorFlow using ``pip install intel-tensorflow-avx512==2.4.0``. ## Build BERT or ResNet50 samples -- To build BERT sample, do ``cd BERT`` or to build ResNet50 sample, do ``cd ResNet50``. +- To build BERT sample, do ``cd BERT``. To build ResNet50 sample, do ``cd ResNet50``. - To clean the sample, do ``make clean`` - To clean and remove downloaded models and datasets, do ``make distclean`` - To build the non-SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/`` - To build the SGX version, do ``make PYTHONDISTPATH=path_to_python_dist_packages/ SGX=1`` -- Typically, path_to_python_dist_packages is '/usr/local/lib/python3.6/dist-packages', but can +- Typically, ``path_to_python_dist_packages`` is ``/usr/local/lib/python3.6/dist-packages``, but can change based on python's installation directory. -- Keras settings are configured in the file root/.keras/keras.json. It is configured to use -tensorflow as backend. 
+- Keras settings are configured in the file ``root/.keras/keras.json``. It is configured to use +TensorFlow as backend. **WARNING:** Building BERT sample downloads about 5GB of data. ## Run inference on BERT model -- To run int8 inference on graphene-sgx (SGX version) +- To run int8 inference on ``graphene-sgx`` (SGX version): ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \ ./python models/models/language_modeling/tensorflow/bert_large/inference/run_squad.py \ @@ -52,26 +51,13 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c --inter_op_parallelism_threads=1 \ --intra_op_parallelism_threads=36 ``` -- To run int8 inference on graphene-direct (non-SGX version), replace `graphene-sgx` with -`graphene-direct` in the above command. -- To run int8 inference on native baremetal (outside Graphene), replace `graphene-sgx ./python` with -`python3` in the above command. - -- Above commands are for a 36 core system. Please set the following options accordingly for optimal - performance: - - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X` - and `intra_op_parallelism_threads=X`. - - Specify the whole range of cores available on one of the sockets in `taskset`. - - If hyperthreading is enabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` - - If hyperthreading is disabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact`` - - Note that `OMP_NUM_THREADS` sets the maximum number of threads to - use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads - to physical processing units. - -**NOTE:** To get number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``. +- To run int8 inference on ``graphene-direct`` (non-SGX version), replace ``graphene-sgx`` with +``graphene-direct`` in the above command. 
+- To run int8 inference on native baremetal (outside Graphene), replace ``graphene-sgx ./python`` with +``python3`` in the above command. ## Run inference on ResNet50 model -- To run inference on graphene-sgx (SGX version) +- To run inference on ``graphene-sgx`` (SGX version): ``` OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c 0-35 graphene-sgx \ ./python models/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py \ @@ -82,24 +68,24 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c --warmup-steps=50 \ --steps=500 ``` -- To run inference on graphene-direct (non-SGX version), replace `graphene-sgx` with -`graphene-direct` in the above command. -- To run inference on native baremetal (outside Graphene), replace `graphene-sgx ./python` with -`python3` in the above command. +- To run inference on ``graphene-direct`` (non-SGX version), replace ``graphene-sgx`` with +``graphene-direct`` in the above command. +- To run inference on native baremetal (outside Graphene), replace ``graphene-sgx ./python`` with +``python3`` in the above command. +## Notes on optimal performance - Above commands are for a 36 core system. Please set the following options accordingly for optimal performance: - - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X` - and `num_intra_threads=X`. + - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X`, + `intra_op_parallelism_threads=X` for BERT and `num_intra_threads=X` for ResNet50. - Specify the whole range of cores available on one of the sockets in `taskset`. 
- If hyperthreading is enabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` - If hyperthreading is disabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact`` - Note that `OMP_NUM_THREADS` sets the maximum number of threads to use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads to physical processing units. - - The options `batch-size`, `warmup-steps` and `steps` can be varied. - -**NOTE:** To get number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``. + - The options `batch-size`, `warmup-steps` and `steps` can be varied for ResNet50 sample. + - To get the number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``. ## Performance considerations - Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency @@ -114,13 +100,13 @@ To use preheat option, add ``sgx.preheat_enclave = true`` to the manifest templa - TCMalloc and mimalloc are memory allocator libraries from Google and Microsoft that can help improve performance significantly based on the workloads. At any point, only one of these allocators can be used. - - TCMalloc (Please update the binary location and name if different from default) + - TCMalloc (Please update the binary location and name if different from default): - Install tcmalloc: ``sudo apt-get install google-perftools`` - Add the following lines in the manifest template and rebuild the sample. 
- ``loader.env.LD_PRELOAD = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"`` - ``sgx.trusted_files.libtcmalloc = "file:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"`` - ``sgx.trusted_files.libunwind = "file:/usr/lib/x86_64-linux-gnu/libunwind.so.8"`` - - mimalloc (Please update the binary location and name if different from default) + - mimalloc (Please update the binary location and name if different from default): - Install mimalloc using the steps from https://github.com/microsoft/mimalloc - Add the following lines in the manifest template and rebuild the sample. - ``loader.env.LD_PRELOAD = "/usr/local/lib/mimalloc-1.7/libmimalloc.so.1.7"`` diff --git a/Examples/tensorflow/ResNet50/Makefile b/Examples/tensorflow/ResNet50/Makefile index 035a262de5..a8d1392532 100755 --- a/Examples/tensorflow/ResNet50/Makefile +++ b/Examples/tensorflow/ResNet50/Makefile @@ -1,9 +1,7 @@ # ResNet50 sample for Tensorflow -GRAPHENEDIR ?= ../../.. -SGX_SIGNER_KEY ?= $(GRAPHENEDIR)/Pal/src/host/Linux-SGX/signer/enclave-key.pem - -include $(GRAPHENEDIR)/Scripts/Makefile.configs +ARCH_LIBDIR ?= /lib/$(shell $(CC) -dumpmachine) +SGX_SIGNER_KEY ?= ../../../Pal/src/host/Linux-SGX/signer/enclave-key.pem ifeq ($(DEBUG),1) GRAPHENE_LOG_LEVEL = debug @@ -30,6 +28,8 @@ python.manifest: python.manifest.template collateral $< >$@ python.manifest.sgx: python.manifest + @test -s $(SGX_SIGNER_KEY) || \ + { echo "SGX signer private key was not found, please specify SGX_SIGNER_KEY!"; exit 1; } graphene-sgx-sign \ --key $(SGX_SIGNER_KEY) \ --manifest python.manifest \ diff --git a/Examples/tensorflow/ResNet50/python.manifest.template b/Examples/tensorflow/ResNet50/python.manifest.template index d5502f2045..a9919460b8 100755 --- a/Examples/tensorflow/ResNet50/python.manifest.template +++ b/Examples/tensorflow/ResNet50/python.manifest.template @@ -1,30 +1,20 @@ libos.entrypoint = "{{ entrypoint }}" loader.preload = "file:{{ graphene.libos }}" -# Graphene log level loader.log_level = "{{ log_level }}" -# Read 
application arguments directly from the command line. Don't use this on production! loader.insecure__use_cmdline_argv = true - -# Propagate environment variables from the host. Don't use this on production! loader.insecure__use_host_env = true - -# Disable address space layout randomization. Don't use this on production! loader.insecure__disable_aslr = true -# Update Library Path - overwrites environment variable loader.env.LD_LIBRARY_PATH = "{{ python.stdlib }}/lib:/lib:{{ arch_libdir }}:/usr/lib:/usr/{{ arch_libdir }}" -# Additional memory for Graphene's internal use loader.pal_internal_mem_size = "512M" -# Default glibc files, mounted from graphene's Runtime directory fs.mount.lib.type = "chroot" fs.mount.lib.path = "/lib" fs.mount.lib.uri = "file:{{ graphene.runtimedir() }}" -# More libraries required by Tensorflow fs.mount.lib2.type = "chroot" fs.mount.lib2.path = "{{ arch_libdir }}" fs.mount.lib2.uri = "file:{{ arch_libdir }}" @@ -57,13 +47,11 @@ fs.mount.etc.type = "chroot" fs.mount.etc.path = "/etc" fs.mount.etc.uri = "file:/etc" -# SGX general options sgx.enclave_size = "32G" sgx.thread_num = 300 sgx.preheat_enclave = true sgx.nonpie_binary = true -# SGX trusted files sgx.trusted_files.runtime = "file:{{ graphene.runtimedir() }}/" sgx.trusted_files.arch_libdir = "file:{{ arch_libdir }}/" sgx.trusted_files.usr_arch_libdir = "file:/usr/{{ arch_libdir }}/" @@ -73,7 +61,6 @@ sgx.trusted_files.pyhome = "file:{{ python.stdlib }}" sgx.trusted_files.pydisthome = "file:{{ python.distlib }}" sgx.trusted_files.pydistpath = "file:{{ pythondistpath }}" -# SGX allowed files sgx.allowed_files.tmp = "file:/tmp/" sgx.allowed_files.etc = "file:/etc/" sgx.allowed_files.proc = "file:/proc/" From 486adb01222c14416c294bab7644f418b4a836da Mon Sep 17 00:00:00 2001 From: Satya Date: Sat, 7 Aug 2021 00:16:44 +0530 Subject: [PATCH 8/8] fixup! fixup! fixup! fixup! fixup! fixup! 
Add TensorFlow examples - ResNet50 and BERT models Signed-off-by: Satya --- Examples/tensorflow/README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/Examples/tensorflow/README.md b/Examples/tensorflow/README.md index d0fae9d22c..c6acfdd996 100755 --- a/Examples/tensorflow/README.md +++ b/Examples/tensorflow/README.md @@ -74,18 +74,19 @@ OMP_NUM_THREADS=36 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 taskset -c ``python3`` in the above command. ## Notes on optimal performance -- Above commands are for a 36 core system. Please set the following options accordingly for optimal - performance: - - Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X`, - `intra_op_parallelism_threads=X` for BERT and `num_intra_threads=X` for ResNet50. - - Specify the whole range of cores available on one of the sockets in `taskset`. - - If hyperthreading is enabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` - - If hyperthreading is disabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact`` - - Note that `OMP_NUM_THREADS` sets the maximum number of threads to - use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads - to physical processing units. - - The options `batch-size`, `warmup-steps` and `steps` can be varied for ResNet50 sample. - - To get the number of cores per socket, do ``lscpu | grep 'Core(s) per socket'``. +Above commands are for a 36 core system. Please set the following options accordingly for optimal +performance: + +- Assuming that X is the number of cores per socket, set `OMP_NUM_THREADS=X`, + `intra_op_parallelism_threads=X` for BERT and `num_intra_threads=X` for ResNet50. +- Specify the whole range of cores available on one of the sockets in `taskset`. 
+- If hyperthreading is enabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact,1,0`` +- If hyperthreading is disabled: use ``KMP_AFFINITY=granularity=fine,verbose,compact`` +- Note that `OMP_NUM_THREADS` sets the maximum number of threads to + use for OpenMP parallel regions, and `KMP_AFFINITY` binds OpenMP threads + to physical processing units. +- The options `batch-size`, `warmup-steps` and `steps` can be varied for the ResNet50 sample. +- To get the number of cores per socket, run ``lscpu | grep 'Core(s) per socket'``. ## Performance considerations - Linux systems have CPU frequency scaling governor that helps the system to scale the CPU frequency