xref: /aosp_15_r20/external/tensorflow/tensorflow/tools/ci_build/ctpu/ctpu.sh (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1#!/bin/bash
2# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15# ==============================================================================
16# Reusable functions for using CTPU in a Kokoro build.
17# These functions are unit tested in ctpu_test.sh.
18
# Installs the Cloud TPU CLI to the current directory.
# Pass pip command as first arg, ex: install_ctpu pip3.7
#
# Arguments:
#   $1 - pip executable used to install cloud-tpu-client (default: pip).
# Outputs:
#   Writes an executable file named "ctpu" into the current directory,
#   overwriting any existing file of that name.
# Returns:
#   wget's exit status if the download fails, otherwise chmod's exit status.
function install_ctpu {
  # Kept local so a caller's own PIP_CMD (or pip_cmd) is never clobbered.
  local pip_cmd="${1:-pip}"

  # TPUClusterResolver has a runtime dependency cloud-tpu-client when
  # resolving a Cloud TPU. It's very likely we want this installed if we're
  # using CTPU.
  # Replace cloud-tpu-client with google-api-python-client oauth2client to test
  # the client at head.
  "${pip_cmd}" install --user --upgrade --ignore-installed cloud-tpu-client

  # -O pins the output name: without it wget saves to "ctpu.1" when "ctpu"
  # already exists, and the chmod below would then bless the stale binary.
  # Because -O creates the file even when the download fails, bail out
  # explicitly instead of marking a truncated file executable.
  wget -nv -O ctpu "https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu" \
    || return
  chmod a+x ctpu
}
34
# Starts a Cloud TPU, storing metadata into artifacts dir for export.
#
# This function supports overriding the default parameters, using optional
# single-letter flags.
#
# Usage:
#   ctpu_up -n [tpu name] -z [zone] -s [tpu size] -v [tf-version] \
#     -p [cloud project] -g [gcp-network]
#
# Globals:
#   TF_ARTIFACTS_DIR (read) - directory that receives the files tpu_name,
#     tpu_zone and, only when -p is given, tpu_project.
#   TPU_NAME, TPU_ZONE (exported) - always set from the resolved name/zone.
#   TPU_PROJECT (exported) - set only when -p is given.
# Returns:
#   The exit status of "./ctpu up". On an unknown flag or a flag that is
#   missing its argument, prints a message and calls "exit 1".
function ctpu_up {
  local OPTIND o  # Used for flag parsing
  # Generate a unique random name for TPU, as we might be running multiple
  # builds in parallel.
  local name="kokoro-tpu-${RANDOM}"
  local zone="us-central1-c"
  local size="v2-8"
  local version="nightly"
  local project  # Project automatically detected from environment.
  local gcp_network  # Network needed only if project default is Legacy.

  # Override any of the above params from flags.
  # The leading ":" selects getopts' silent mode: for an unknown flag "o" is
  # set to "?" and for a missing argument to ":"; in both cases the offending
  # flag letter is in OPTARG, so that is what gets reported.
  while getopts ":n:z:p:s:v:g:" o; do
    case "${o}" in
      n)
        name="${OPTARG}"
        ;;
      z)
        zone="${OPTARG}"
        ;;
      p)
        project="${OPTARG}"
        ;;
      s)
        size="${OPTARG}"
        ;;
      v)
        version="${OPTARG}"
        ;;
      g)
        gcp_network="${OPTARG}"
        ;;
      :)
        echo "Missing argument for ctpu_up flag: -${OPTARG}"
        exit 1
        ;;
      *)
        echo "Unexpected parameter for ctpu_up: -${OPTARG}"
        exit 1
        ;;
    esac
  done
  shift $((OPTIND-1))

  export TPU_NAME="${name}"
  export TPU_ZONE="${zone}"

  # Store name and zone into artifacts dir so cleanup job has access.
  echo "${TPU_NAME}" > "${TF_ARTIFACTS_DIR}/tpu_name"
  echo "${TPU_ZONE}" > "${TF_ARTIFACTS_DIR}/tpu_zone"

  local args=(
    "--zone=${zone}"
    "--tf-version=${version}"
    "--name=${name}"
    "--tpu-size=${size}"
    "--tpu-only"
    "-noconf"
  )

  # "-v" is a bash 4.2 builtin for checking that a variable is set.
  # "local var" without a value leaves the variable unset, so these branches
  # run only when the corresponding flag was actually passed.
  if [[ -v gcp_network ]]; then
    args+=("--gcp-network=${gcp_network}")
  fi

  if [[ -v project ]]; then
    args+=("--project=${project}")
    export TPU_PROJECT="${project}"
    echo "${project}" > "${TF_ARTIFACTS_DIR}/tpu_project"
  fi

  ./ctpu up "${args[@]}"
}
110
# Delete the Cloud TPU specified by the metadata in the gfile directory.
#
# Globals:
#   TF_GFILE_DIR (read) - directory holding the files tpu_name and tpu_zone,
#     and optionally tpu_project.
#   TPU_NAME, TPU_ZONE, TPU_PROJECT (exported) - set from those files;
#     TPU_PROJECT falls back to "tensorflow-testing" when tpu_project is absent.
# Returns:
#   0 once "./ctpu delete" succeeds; 1 if a metadata file cannot be read or
#   if all attempts fail.
function ctpu_delete {
  local project_file="${TF_GFILE_DIR}/tpu_project"
  local -r max_attempts=3
  local name zone project attempt

  # Read into locals first: "export VAR=$(cmd)" would hide a failing cat, and
  # deleting with an empty name or zone is never what the caller wants.
  name="$(cat "${TF_GFILE_DIR}/tpu_name")" || return 1
  zone="$(cat "${TF_GFILE_DIR}/tpu_zone")" || return 1
  if [[ -f "${project_file}" ]]; then
    project="$(cat "${project_file}")" || return 1
  else
    project="tensorflow-testing"
  fi

  export TPU_NAME="${name}"
  export TPU_ZONE="${zone}"
  export TPU_PROJECT="${project}"

  # Retry due to rare race condition where TPU creation hasn't propagated by
  # the time we try to delete it.
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if ./ctpu delete \
      --project="${TPU_PROJECT}" \
      --zone="${TPU_ZONE}" \
      --name="${TPU_NAME}" \
      --tpu-only \
      -noconf; then
      return 0
    fi
    # Only wait when another attempt will follow.
    if ((attempt < max_attempts)); then
      sleep 60
    fi
  done
  return 1
}
134