#!/bin/bash
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Reusable functions for using CTPU in a Kokoro build.
# These functions are unit tested in ctpu_test.sh.

# Installs the Cloud TPU CLI to the current directory.
# Pass pip command as first arg, ex: install_ctpu pip3.7
#
# Arguments:
#   $1 - pip command used to install cloud-tpu-client (default: pip).
# Outputs:
#   Writes an executable file named "ctpu" into the current directory.
# Returns:
#   Non-zero if the download fails or the binary cannot be made executable.
function install_ctpu {
  local pip_cmd="${1:-pip}"

  # TPUClusterResolver has a runtime dependency cloud-tpu-client when
  # resolving a Cloud TPU. It's very likely we want this installed if we're
  # using CTPU.
  # Replace cloud-tpu-client with google-api-python-client oauth2client to test
  # the client at head.
  "${pip_cmd}" install --user --upgrade --ignore-installed cloud-tpu-client

  # -O pins the output file name: without it wget saves to "ctpu.1" when a
  # "ctpu" file already exists, and the chmod below would hit the stale copy.
  wget -nv -O ctpu "https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu" \
    || return 1
  chmod a+x ctpu
}

# Starts a Cloud TPU, storing metadata into artifacts dir for export.
#
# This function supports overriding the default parameters, using optional
# single-letter flags.
#
# Usage:
#   ctpu_up -n [tpu name] -z [zone] -s [tpu size] -v [tf-version] \
#     -p [cloud project] -g [gcp-network]
#
# Globals:
#   TF_ARTIFACTS_DIR (read)  - existing directory that receives the files
#                              tpu_name, tpu_zone and (with -p) tpu_project.
#   TPU_NAME, TPU_ZONE (exported), TPU_PROJECT (exported only with -p).
# Returns:
#   1 if TF_ARTIFACTS_DIR is not a directory; otherwise the exit status of
#   "./ctpu up".
# Exits:
#   Calls "exit 1" on an unknown flag or a flag missing its argument; since
#   this file is meant to be sourced, that terminates the calling shell.
function ctpu_up {
  local OPTIND OPTARG o  # Used for flag parsing
  # Generate a unique random name for TPU, as we might be running multiple
  # builds in parallel.
  local name="kokoro-tpu-${RANDOM}"
  local zone="us-central1-c"
  local size="v2-8"
  local version="nightly"
  local project  # Project automatically detected from environment.
  local gcp_network  # Network needed only if project default is Legacy.

  # Override any of the above params from flags.
  # The leading ":" selects getopts' silent mode: on error ${o} is ":" or "?"
  # and the offending flag letter is in ${OPTARG}.
  while getopts ":n:z:p:s:v:g:" o; do
    case "${o}" in
      n)
        name="${OPTARG}"
        ;;
      z)
        zone="${OPTARG}"
        ;;
      p)
        project="${OPTARG}"
        ;;
      s)
        size="${OPTARG}"
        ;;
      v)
        version="${OPTARG}"
        ;;
      g)
        gcp_network="${OPTARG}"
        ;;
      :)
        echo "Missing argument for ctpu_up flag: -${OPTARG}" >&2
        exit 1
        ;;
      *)
        echo "Unexpected parameter for ctpu_up: -${OPTARG}" >&2
        exit 1
        ;;
    esac
  done
  shift $((OPTIND-1))

  # Refuse to create a TPU whose metadata cannot be recorded: the cleanup job
  # locates the TPU through these files.
  if [[ ! -d "${TF_ARTIFACTS_DIR:-}" ]]; then
    echo "ctpu_up: TF_ARTIFACTS_DIR is not a directory:" \
      "'${TF_ARTIFACTS_DIR:-}'" >&2
    return 1
  fi

  export TPU_NAME="${name}"
  export TPU_ZONE="${zone}"

  # Store name and zone into artifacts dir so cleanup job has access.
  printf '%s\n' "${TPU_NAME}" > "${TF_ARTIFACTS_DIR}/tpu_name"
  printf '%s\n' "${TPU_ZONE}" > "${TF_ARTIFACTS_DIR}/tpu_zone"

  local args=(
    "--zone=${zone}"
    "--tf-version=${version}"
    "--name=${name}"
    "--tpu-size=${size}"
    "--tpu-only"
    "-noconf"
  )

  # "-v" is a bash 4.2 builtin for checking that a variable is set.
  if [[ -v gcp_network ]]; then
    args+=("--gcp-network=${gcp_network}")
  fi

  if [[ -v project ]]; then
    args+=("--project=${project}")
    export TPU_PROJECT="${project}"
    printf '%s\n' "${project}" > "${TF_ARTIFACTS_DIR}/tpu_project"
  fi

  ./ctpu up "${args[@]}"
}

# Delete the Cloud TPU specified by the metadata in the gfile directory.
#
# Globals:
#   TF_GFILE_DIR (read) - directory holding tpu_name, tpu_zone and, optionally,
#                         tpu_project.
#   TPU_NAME, TPU_ZONE, TPU_PROJECT (exported). TPU_PROJECT falls back to
#   "tensorflow-testing" when no tpu_project file exists.
# Returns:
#   0 as soon as "./ctpu delete" succeeds; 1 if the metadata cannot be read or
#   is empty, or if all three delete attempts fail.
function ctpu_delete {
  local project_file="${TF_GFILE_DIR}/tpu_project"
  local tpu_name tpu_zone tpu_project

  # Read into locals first: "export VAR=$(cmd)" would mask a failing cat.
  if ! tpu_name="$(cat "${TF_GFILE_DIR}/tpu_name")" \
      || ! tpu_zone="$(cat "${TF_GFILE_DIR}/tpu_zone")"; then
    echo "ctpu_delete: cannot read TPU metadata from '${TF_GFILE_DIR}'" >&2
    return 1
  fi
  # NOTE(review): an empty --name/--zone presumably would not target the TPU
  # this build created -- confirm against ctpu's defaults. Fail fast instead.
  if [[ -z "${tpu_name}" || -z "${tpu_zone}" ]]; then
    echo "ctpu_delete: empty TPU name or zone in '${TF_GFILE_DIR}'" >&2
    return 1
  fi

  if [[ -f "${project_file}" ]]; then
    tpu_project="$(cat "${project_file}")" || return 1
  else
    tpu_project="tensorflow-testing"
  fi

  export TPU_NAME="${tpu_name}"
  export TPU_ZONE="${tpu_zone}"
  export TPU_PROJECT="${tpu_project}"

  # Retry due to rare race condition where TPU creation hasn't propagated by
  # the time we try to delete it.
  local -r max_attempts=3
  local attempt
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if ./ctpu delete \
        "--project=${TPU_PROJECT}" \
        "--zone=${TPU_ZONE}" \
        "--name=${TPU_NAME}" \
        --tpu-only \
        -noconf; then
      return 0
    fi
    # No point waiting after the final attempt; we are about to give up.
    if ((attempt < max_attempts)); then
      sleep 60
    fi
  done
  return 1
}