Configuration

- name: <string>
  kind: AsyncAPI
  handler: # detailed configuration below
  compute: # detailed configuration below
  autoscaling: # detailed configuration below
  update_strategy: # detailed configuration below
  networking: # detailed configuration below

Handler

Python Handler

handler:
  type: python
  path: <string>  # path to a python file with a Handler class definition, relative to the Cortex root (required)
  dependencies: # (optional)
    pip: <string>  # relative path to requirements.txt (default: requirements.txt)
    conda: <string>  # relative path to conda-packages.txt (default: conda-packages.txt)
    shell: <string>  # relative path to a shell script for system package installation (default: dependencies.sh)
  config: <string: value>  # arbitrary dictionary passed to the constructor of the Handler class (optional)
  python_path: <string>  # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml)
  image: <string>  # docker image to use for the handler (default: quay.io/cortexlabs/python-handler-cpu:0.34.0, quay.io/cortexlabs/python-handler-gpu:0.34.0-cuda10.2-cudnn8, or quay.io/cortexlabs/python-handler-inf:0.34.0 based on compute)
  env: <string: string>  # dictionary of environment variables
  log_level: <string>  # log level that can be "debug", "info", "warning" or "error" (default: "info")
  shm_size: <string>  # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null)

TensorFlow Handler

handler:
  type: tensorflow
  path: <string>  # path to a python file with a Handler class definition, relative to the Cortex root (required)
  dependencies: # (optional)
    pip: <string>  # relative path to requirements.txt (default: requirements.txt)
    conda: <string>  # relative path to conda-packages.txt (default: conda-packages.txt)
    shell: <string>  # relative path to a shell script for system package installation (default: dependencies.sh)
  models:  # (required)
    path: <string> # S3 path to an exported SavedModel directory (e.g. s3://my-bucket/exported_model/) (either this, 'dir', or 'paths' must be provided)
    paths:  # list of S3 paths to exported SavedModel directories (either this, 'dir', or 'path' must be provided)
      - name: <string>  # unique name for the model (e.g. text-generator) (required)
        path: <string>  # S3 path to an exported SavedModel directory (e.g. s3://my-bucket/exported_model/) (required)
        signature_key: <string>  # name of the signature def to use for prediction (required if your model has more than one signature def)
      ...
    dir: <string>  # S3 path to a directory containing multiple SavedModel directories (e.g. s3://my-bucket/models/) (either this, 'path', or 'paths' must be provided)
    signature_key: <string>  # name of the signature def to use for prediction (required if your model has more than one signature def)
  config: <string: value>  # arbitrary dictionary passed to the constructor of the Handler class (optional)
  python_path: <string>  # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml)
  image: <string>  # docker image to use for the handler (default: quay.io/cortexlabs/tensorflow-handler:0.34.0)
  tensorflow_serving_image: <string>  # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-cpu:0.34.0, quay.io/cortexlabs/tensorflow-serving-gpu:0.34.0, or quay.io/cortexlabs/tensorflow-serving-inf:0.34.0 based on compute)
  env: <string: string>  # dictionary of environment variables
  log_level: <string>  # log level that can be "debug", "info", "warning" or "error" (default: "info")
  shm_size: <string>  # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null)

Compute

compute:
  cpu: <string | int | float>  # CPU request per replica. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m)
  gpu: <int>  # GPU request per replica. One unit of GPU corresponds to one virtual GPU (default: 0)
  inf: <int>  # Inferentia request per replica. One unit of Inf corresponds to one virtual Inferentia chip (default: 0)
  mem: <string>  # memory request per replica. One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of-two counterparts: Ki, Mi, Gi, Ti) (default: Null)
  node_groups: <list:string>  # to select specific node groups (optional)

Autoscaling

autoscaling:
  min_replicas: <int>  # minimum number of replicas (default: 1)
  max_replicas: <int>  # maximum number of replicas (default: 100)
  init_replicas: <int>  # initial number of replicas (default: <min_replicas>)
  max_replica_concurrency: <int>  # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024)
  target_replica_concurrency: <float>  # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process)
  window: <duration>  # the time over which to average the API's concurrency (default: 60s)
  downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m)
  upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m)
  max_downscale_factor: <float>  # the maximum factor by which to scale down the API on a single scaling event (default: 0.75)
  max_upscale_factor: <float>  # the maximum factor by which to scale up the API on a single scaling event (default: 1.5)
  downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05)
  upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05)

Update strategy

update_strategy:
  max_surge: <string | int>  # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
  max_unavailable: <string | int>  # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)

Networking

networking:
  endpoint: <string>  # the endpoint for the API (default: <api_name>)

Previous: Handler | Next: TensorFlow Models

Last updated 3 years ago