Configuration

- name: <string>  # name of the API (required)
  kind: RealtimeAPI  # must be "RealtimeAPI" for realtime APIs (required)
  pod:  # pod configuration (required)
    port: <int>  # port to which requests will be sent (default: 8080; exported as $CORTEX_PORT)
    max_concurrency: <int>  # maximum number of requests that will be concurrently sent into the container (default: 1)
    max_queue_length: <int>  # maximum number of requests per replica which will be queued (beyond max_concurrency) before requests are rejected with error code 503 (default: 100)
    containers:  # configurations for the containers to run (at least one container must be provided)
      - name: <string>  # name of the container (required)
        image: <string>  # docker image to use for the container (required)
        command: <list[string]>  # entrypoint (not executed within a shell); env vars can be used with e.g. $(CORTEX_PORT) (default: the docker image's ENTRYPOINT)
        args: <list[string]>  # arguments to the entrypoint; env vars can be used with e.g. $(CORTEX_PORT) (default: the docker image's CMD)
        env: <map[string:string]>  # dictionary of environment variables to set in the container (optional)
        compute:  # compute resource requests (default: see below)
          cpu: <string|int|float>  # CPU request for the container; one unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m)
          gpu: <int>  # GPU request for the container; one unit of GPU corresponds to one virtual GPU (default: 0)
          inf: <int>  # Inferentia request for the container; one unit of inf corresponds to one virtual Inferentia chip (default: 0)
          mem: <string>  # memory request for the container; one unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of-two counterparts: Ki, Mi, Gi, Ti) (default: Null)
          shm: <string>  # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null)
        readiness_probe:  # periodic probe of container readiness; traffic will not be sent into the pod unless all containers' readiness probes are succeeding (optional)
          http_get:  # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified)
            port: <int|string>  # the port to access on the container (required)
            path: <string>  # the path to access on the HTTP server (default: /)
          tcp_socket:  # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified)
            port: <int|string>  # the port to access on the container (required)
          exec:  # specifies a command to run which must exit with code 0 (only one of http_get, tcp_socket, and exec may be specified)
            command: <list[string]>  # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required)
          initial_delay_seconds: <int>  # number of seconds after the container has started before the probe is initiated (default: 0)
          timeout_seconds: <int>  # number of seconds until the probe times out (default: 1)
          period_seconds: <int>  # how often (in seconds) to perform the probe (default: 10)
          success_threshold: <int>  # minimum consecutive successes for the probe to be considered successful after having failed (default: 1)
          failure_threshold: <int>  # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3)
        liveness_probe:  # periodic probe of container liveness; container will be restarted if the probe fails (optional)
          http_get:  # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified)
            port: <int|string>  # the port to access on the container (required)
            path: <string>  # the path to access on the HTTP server (default: /)
          tcp_socket:  # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified)
            port: <int|string>  # the port to access on the container (required)
          exec:  # specifies a command to run which must exit with code 0 (only one of http_get, tcp_socket, and exec may be specified)
            command: <list[string]>  # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required)
          initial_delay_seconds: <int>  # number of seconds after the container has started before the probe is initiated (default: 0)
          timeout_seconds: <int>  # number of seconds until the probe times out (default: 1)
          period_seconds: <int>  # how often (in seconds) to perform the probe (default: 10)
          success_threshold: <int>  # minimum consecutive successes for the probe to be considered successful after having failed (default: 1)
          failure_threshold: <int>  # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3)
        pre_stop:  # a pre-stop lifecycle hook for the container; will be executed before container termination (optional)
          http_get:  # specifies an http endpoint to send a request to (only one of http_get and exec may be specified)
            port: <int|string>  # the port to access on the container (required)
            path: <string>  # the path to access on the HTTP server (default: /)
          exec:  # specifies a command to run (only one of http_get and exec may be specified)
            command: <list[string]>  # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required)
  autoscaling:  # autoscaling configuration (default: see below)
    min_replicas: <int>  # minimum number of replicas (default: 1)
    max_replicas: <int>  # maximum number of replicas (default: 100)
    init_replicas: <int>  # initial number of replicas (default: <min_replicas>)
    target_in_flight: <float>  # desired number of in-flight requests per replica (including requests actively being processed as well as queued), which the autoscaler tries to maintain (default: <max_concurrency>)
    window: <duration>  # duration over which to average the API's in-flight requests per replica (default: 60s)
    downscale_stabilization_period: <duration>  # the API will not scale below the highest recommendation made during this period (default: 5m)
    upscale_stabilization_period: <duration>  # the API will not scale above the lowest recommendation made during this period (default: 1m)
    max_downscale_factor: <float>  # maximum factor by which to scale down the API on a single scaling event (default: 0.75)
    max_upscale_factor: <float>  # maximum factor by which to scale up the API on a single scaling event (default: 1.5)
    downscale_tolerance: <float>  # any recommendation falling within this factor below the current number of replicas will not trigger a scale-down event (default: 0.05)
    upscale_tolerance: <float>  # any recommendation falling within this factor above the current number of replicas will not trigger a scale-up event (default: 0.05)
  node_groups: <list[string]>  # a list of node groups on which this API can run (default: all node groups are eligible)
  update_strategy:  # deployment strategy to use when replacing existing replicas with new ones (default: see below)
    max_surge: <string|int>  # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates)
    max_unavailable: <string|int>  # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%)
  networking:  # networking configuration (default: see below)
    endpoint: <string>  # endpoint for the API (default: <api_name>)

Last updated