Skip to content
This repository was archived by the owner on Dec 4, 2024. It is now read-only.

Zero downtime deployments result in 503s #657

@sgvandijk

Description

@sgvandijk

We are trying to use zero-downtime deployments (ZDDs); however, these result in a brief period where the old instances are set to maintenance mode in HAProxy while the new instances are still coming up, which causes 503 responses until the new version has fully come online.

Versions:
DC/OS: 1.12.0
Mesos: 1.7.1
Marathon: 1.7.174
Marathon-LB: 1.12.3 (also tested with 1.14.1)

At the time of deployment, dcos marathon app list --json gives:

  {
    "backoffFactor": 1.15,
    "backoffSeconds": 1,
    "constraints": [
      [
        "server_group",
        "IS",
        "PrivateAgentServerGroup"
      ]
    ],
    "container": {
      "docker": {
        "forcePullImage": false,
        "image": "<redacted>",
        "parameters": [
          {
            "key": "tty",
            "value": "true"
          },
          {
            "key": "log-driver",
            "value": "none"
          }
        ],
        "privileged": false
      },
      "portMappings": [
        {
          "containerPort": 80,
          "hostPort": 0,
          "labels": {},
          "protocol": "tcp",
          "servicePort": 10002
        }
      ],
      "type": "DOCKER",
      "volumes": []
    },
    "cpus": 0.5,
    "deployments": [],
    "disk": 0,
    "env": {
      "AWS_ACCESS_KEY_ID": "<redacted>",
      "AWS_SECRET_ACCESS_KEY": "<redacted>"
    },
    "executor": "",
    "gpus": 0,
    "healthChecks": [
      {
        "delaySeconds": 15,
        "gracePeriodSeconds": 300,
        "intervalSeconds": 60,
        "ipProtocol": "IPv4",
        "maxConsecutiveFailures": 3,
        "path": "/appname/health",
        "portIndex": 0,
        "protocol": "MESOS_HTTP",
        "timeoutSeconds": 20
      }
    ],
    "id": "/appname-blue",
    "instances": 1,
    "killSelection": "YOUNGEST_FIRST",
    "labels": {
      "HAPROXY_0_ENABLED": "true",
      "HAPROXY_0_PATH": "/appname",
      "HAPROXY_0_PORT": "10002",
      "HAPROXY_0_VHOST": "<redacted>",
      "HAPROXY_APP_ID": "appname",
      "HAPROXY_DEPLOYMENT_ALT_PORT": "10003",
      "HAPROXY_DEPLOYMENT_COLOR": "blue",
      "HAPROXY_DEPLOYMENT_GROUP": "appname",
      "HAPROXY_DEPLOYMENT_STARTED_AT": "2019-11-21T14:21:31.356274",
      "HAPROXY_DEPLOYMENT_TARGET_INSTANCES": "1",
      "HAPROXY_GROUP": "external"
    },
    "maxLaunchDelaySeconds": 3600,
    "mem": 2048,
    "networks": [
      {
        "mode": "container/bridge"
      }
    ],
    "requirePorts": false,
    "tasksHealthy": 1,
    "tasksRunning": 1,
    "tasksStaged": 0,
    "tasksUnhealthy": 0,
    "unreachableStrategy": {
      "expungeAfterSeconds": 0,
      "inactiveAfterSeconds": 0
    },
    "upgradeStrategy": {
      "maximumOverCapacity": 1,
      "minimumHealthCapacity": 1
    },
    "version": "2019-11-21T14:21:32.151Z",
    "versionInfo": {
      "lastConfigChangeAt": "2019-11-21T14:21:32.151Z",
      "lastScalingAt": "2019-11-21T14:21:32.151Z"
    }
  }

The new version gets deployed by posting the following configuration to Marathon:

{
  "id": "/appname-green",
  "cpus": 0.5,
  "mem": 2048,
  "instances": 1,
  "container": {
    "type": "DOCKER",
    "docker": {
      "image": "<redacted>",
      "network": "BRIDGE",
      "portMappings": [
        {
          "hostPort": 0,
          "containerPort": 80,
          "servicePort": 10003
        }
      ],
      "parameters": [
        {
          "key": "tty",
          "value": "true"
        },
        {
          "key": "log-driver",
          "value": "none"
        }
      ]
    }
  },
  "constraints": [
    [
      "server_group",
      "IS",
      "PrivateAgentServerGroup"
    ]
  ],
  "labels": {
    "HAPROXY_GROUP": "external",
    "HAPROXY_0_VHOST": "<redacted>",
    "HAPROXY_0_PATH": "/appname",
    "HAPROXY_DEPLOYMENT_ALT_PORT": "10003",
    "HAPROXY_0_ENABLED": "true",
    "HAPROXY_APP_ID": "appname",
    "HAPROXY_DEPLOYMENT_GROUP": "appname",
    "HAPROXY_DEPLOYMENT_COLOR": "green",
    "HAPROXY_0_PORT": "10002",
    "HAPROXY_DEPLOYMENT_TARGET_INSTANCES": "1",
    "HAPROXY_DEPLOYMENT_STARTED_AT": "2019-11-21T14:58:20.311986"
  },
  "env": {
    "AWS_ACCESS_KEY_ID": "<redacted>",
    "AWS_SECRET_ACCESS_KEY": "<redacted>"
  },
  "healthChecks": [
    {
      "gracePeriodSeconds": 300,
      "intervalSeconds": 60,
      "maxConsecutiveFailures": 3,
      "portIndex": 0,
      "timeoutSeconds": 20,
      "delaySeconds": 15,
      "protocol": "MESOS_HTTP",
      "path": "/appname/health",
      "ipProtocol": "IPv4"
    }
  ]
}

As far as I can tell, this follows the logic in the README and zdd.py.

Checking the HAProxy stats immediately after submission shows that the previous (blue) instance is put straight into maintenance mode while the new (green) instance is still coming up:

image

If the green instance takes a while to start, it is marked as DOWN:

image

During this time, until the green instance is up, trying to access the app returns 503 errors.

Happy to provide any more info if needed!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions