8 changes: 4 additions & 4 deletions aws/cloudformation/metaflow-cfn-template.yml
@@ -92,7 +92,7 @@ Mappings:
ServiceName:
value: 'metadata-service-v2'
ImageUrl:
value: 'netflixoss/metaflow_metadata_service:v2.2.3'
value: 'netflixoss/metaflow_metadata_service:v2.3.5'
ContainerPort:
value: 8080
ContainerCpu:
@@ -113,7 +113,7 @@ Mappings:
ServiceName:
value: 'metaflow-ui-service'
ImageUrl:
value: 'netflixoss/metaflow_metadata_service:v2.2.3'
value: 'netflixoss/metaflow_metadata_service:v2.3.5'
ContainerPort:
value: 8083
ContainerCpu:
@@ -134,7 +134,7 @@ Mappings:
ServiceName:
value: 'metadata-ui-static'
ImageUrl:
value: 'public.ecr.aws/outerbounds/metaflow_ui:v1.1.1'
value: 'public.ecr.aws/outerbounds/metaflow_ui:v1.2.4'
ContainerPort:
value: 3000
ContainerCpu:
@@ -1127,7 +1127,7 @@ Resources:
echo 'export AWS_DEFAULT_REGION=${AWS::Region}' >> /etc/profile.d/jupyter-env.sh
echo 'export METAFLOW_DEFAULT_DATASTORE=s3' >> /etc/profile.d/jupyter-env.sh
echo 'export METAFLOW_DEFAULT_METADATA=service' >> /etc/profile.d/jupyter-env.sh
initctl restart jupyter-server --no-wait
systemctl restart jupyter-server
OnStart:
- Content:
Fn::Base64:
4 changes: 4 additions & 0 deletions azure/terraform/infra/credentials.tf
@@ -1,3 +1,5 @@
data "azuread_client_config" "current" {}

# Add an application, a service principal, and a password for the service principal
# This single service principal has access to:
# - Metaflow's storage container
@@ -6,10 +8,12 @@
# E.g. an end user needs to be able to access Metaflow storage AND submit jobs to AKS (possibly)
resource "azuread_application" "service_principal_application" {
display_name = var.service_principal_name
owners = [data.azuread_client_config.current.object_id]
}

resource "azuread_service_principal" "service_principal" {
application_id = azuread_application.service_principal_application.application_id
owners = [data.azuread_client_config.current.object_id]
}

# This will be used as a AZURE_CLIENT_SECRET in Metaflow's AKS workloads
2 changes: 1 addition & 1 deletion azure/terraform/infra/database.tf
@@ -28,7 +28,7 @@ resource "azurerm_postgresql_flexible_server" "metaflow_database_server" {

storage_mb = 32768

sku_name = "GP_Standard_D4s_v3"
sku_name = "B_Standard_B2s"
depends_on = [azurerm_private_dns_zone_virtual_network_link.metaflow_database_private_dns_zone_virtual_network_link]

}
2 changes: 1 addition & 1 deletion azure/terraform/infra/kubernetes.tf
@@ -34,7 +34,7 @@ resource "azurerm_kubernetes_cluster" "metaflow_kubernetes" {
resource "azurerm_kubernetes_cluster_node_pool" "metaflow_kubernetes_compute_node_pool" {
name = "taskworkers"
kubernetes_cluster_id = azurerm_kubernetes_cluster.metaflow_kubernetes.id
vm_size = "Standard_D8_v5"
vm_size = "Standard_D4_v5"
node_count = 1
enable_auto_scaling = true
vnet_subnet_id = azurerm_subnet.metaflow_kubernetes_subnet.id
3 changes: 3 additions & 0 deletions azure/terraform/main.tf
@@ -107,4 +107,7 @@ module "services" {

airflow_version = local.airflow_version
airflow_frenet_secret = local.airflow_frenet_secret

argo_workflows_version = local.argo_workflows_version
argo_events_version = local.argo_events_version
}
6 changes: 3 additions & 3 deletions azure/terraform/services/argo.tf
@@ -14,10 +14,10 @@ resource "kubernetes_namespace" "argo-events" {

locals {
is_windows = substr(pathexpand("~"), 0, 1) == "/" ? false : true
_argo_cmd = "kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo-workflows/master/manifests/quick-start-postgres.yaml"
_argo_events_cmd = "kubectl apply -f https://raw.githubusercontent.com/argoproj/argo-events/v1.7.3/manifests/install.yaml"
_argo_cmd = var.argo_workflows_version == "latest" ? "kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/latest/download/quick-start-postgres.yaml" : "kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/${var.argo_workflows_version}/quick-start-postgres.yaml"
_argo_events_cmd = "kubectl apply -f https://raw.githubusercontent.com/argoproj/argo-events/${var.argo_events_version}/manifests/install.yaml"
_service_accts_cmd = "kubectl apply -n argo -f ${path.module}/argo_events/service_accounts.yaml"
_event_bus_cmd = "kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo-events/v1.7.3/examples/eventbus/native.yaml"
_event_bus_cmd = "kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo-events/${var.argo_events_version}/examples/eventbus/native.yaml"
_webhook_source_cmd = "kubectl apply -n argo -f ${path.module}/argo_events/webhook_source.yaml"
}

8 changes: 8 additions & 0 deletions azure/terraform/services/variables.tf
@@ -57,6 +57,14 @@ variable "airflow_frenet_secret" {
type = string
}

variable "argo_workflows_version" {
type = string
}

variable "argo_events_version" {
type = string
}

variable "deploy_argo" {
type = bool
}
18 changes: 14 additions & 4 deletions azure/terraform/variables.tf
@@ -19,9 +19,9 @@ locals {
k8s_subnet_name = "snet-${var.org_prefix}-metaflow-k8s-${local.location}-${terraform.workspace}"

# Changeable after initial "terraform apply" (e.g. image upgrades, secret rotation)
metadata_service_image = "public.ecr.aws/outerbounds/metaflow_metadata_service:2.3.3"
metaflow_ui_static_service_image = "public.ecr.aws/outerbounds/metaflow_ui:v1.1.4"
metaflow_ui_backend_service_image = "public.ecr.aws/outerbounds/metaflow_metadata_service:2.3.3"
metadata_service_image = "public.ecr.aws/outerbounds/metaflow_metadata_service:2.3.6"
metaflow_ui_static_service_image = "public.ecr.aws/outerbounds/metaflow_ui:v1.2.4"
metaflow_ui_backend_service_image = "public.ecr.aws/outerbounds/metaflow_metadata_service:2.3.6"
metaflow_kubernetes_secret_name = "metaflow-azure-storage-credentials"

# Forever constants
Expand All @@ -30,9 +30,19 @@ locals {
metaflow_db_password = "metaflow" # DB is private, accessible only within vnet.
metaflow_db_port = 5432

# Airflow Related Options
# Changeable Airflow Related Options
# See https://airflow.apache.org/docs/apache-airflow/stable/release_notes.html
airflow_version = "2.3.3"
airflow_frenet_secret = "myverysecretvalue"

# Changeable Argo Related Options
  # In non-production/test environments these versions may be set to "latest". If this repo is
  # used as a starting point for a production k8s cluster running Metaflow, it is a best
  # practice to pin specific version numbers here and track them in your own version control repository.
# See https://github.com/argoproj/argo-workflows/releases
argo_workflows_version = "v3.4.4"
# See https://github.com/argoproj/argo-events/releases
argo_events_version = "v1.7.6"
}

# You MUST set this to ensure global (Azure-wide) uniqueness of:
108 changes: 68 additions & 40 deletions gcp/terraform/.terraform.lock.hcl

Some generated files are not rendered by default.

29 changes: 29 additions & 0 deletions gcp/terraform/README.md
@@ -50,6 +50,35 @@ Then, apply the `services` module (deploys Metaflow services to GKE)

The step above will output next steps for Metaflow end users.

## Metaflow job orchestration options
The recommended way to orchestrate Metaflow workloads on Kubernetes is via [Argo Workflows](https://docs.metaflow.org/going-to-production-with-metaflow/scheduling-metaflow-flows/scheduling-with-argo-workflows). However, Airflow is also supported as an alternative.

The template also provides the `deploy_airflow` and `deploy_argo` flags as variables. These are booleans that specify whether [Airflow](https://airflow.apache.org/) or [Argo Workflows](https://argoproj.github.io/argo-workflows/) will be deployed in the Kubernetes cluster along with the Metaflow-related services. By default, `deploy_argo` is set to __true__ and `deploy_airflow` to __false__.
To change these, set them in your `FILE.tfvars` file (or via another [terraform variable](https://www.terraform.io/language/values/variables) passing mechanism).
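
For instance, a minimal `FILE.tfvars` might look like the sketch below (the flag names are taken from this template's [variables](./variables.tf); everything else about your tfvars file is up to you):
```
# Hypothetical FILE.tfvars — enable Argo Workflows, skip Airflow
deploy_argo    = true
deploy_airflow = false
```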

### Argo Workflows
Argo Workflows is installed by default on the GKE cluster as part of the `services` submodule. Setting the `deploy_argo` [variable](./variables.tf) controls whether Argo is deployed in the GKE cluster. No additional configuration is needed in the `infra` module to support `argo`.

After you have changed the value of `deploy_argo`, re-apply terraform for both [infra and services](#usage).
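
If you prefer to pin Argo versions rather than track the latest release, a sketch of such locals is below. The variable names mirror the `argo_workflows_version` / `argo_events_version` locals used in the Azure template; treat them as an assumption rather than variables the GCP module necessarily exposes:
```
# Sketch only — names mirror the Azure template's locals and may not exist in the GCP module.
# See https://github.com/argoproj/argo-workflows/releases and https://github.com/argoproj/argo-events/releases
argo_workflows_version = "v3.4.4"
argo_events_version    = "v1.7.6"
```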

### Airflow

**This is a quickstart template only and is not recommended for real production deployments.**

If `deploy_airflow` is set to true, then the `services` module will deploy Airflow via a [helm chart](https://airflow.apache.org/docs/helm-chart/stable/index.html) into the Kubernetes cluster (the one deployed by the `infra` module).

The terraform template deploys Airflow configured with a `LocalExecutor` for simplicity; Metaflow can work with any Airflow executor.
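
As a rough illustration (not this template's actual `services` module), the executor is typically selected through the helm chart's `executor` value; a minimal sketch with the Terraform helm provider might look like:
```
# Minimal sketch, assuming the official apache-airflow helm chart; resource and namespace names are hypothetical.
resource "helm_release" "airflow" {
  name       = "airflow"
  repository = "https://airflow.apache.org"
  chart      = "airflow"
  namespace  = "airflow"

  set {
    name  = "executor"
    value = "LocalExecutor" # any Airflow executor works with Metaflow
  }
}
```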

After you have changed the value of `deploy_airflow`, re-apply terraform for both [infra and services](#usage).

#### Shipping Metaflow compiled DAGs to Airflow
Airflow expects Python files containing Airflow DAGs to be present in the [dags_folder](https://airflow.apache.org/docs/apache-airflow/2.2.0/configurations-ref.html#dags-folder). By default, this terraform template uses the [default](https://airflow.apache.org/docs/helm-chart/stable/parameters-ref.html#airflow) set in the Airflow helm chart, which is `{AIRFLOW_HOME}/dags` (`/opt/airflow/dags`).

The metaflow-tools repository also ships an [airflow_dag_upload.py](../../scripts/airflow_dag_upload.py) script that can help sync Airflow DAG files generated by Metaflow to the Airflow scheduler _deployed by this template_. Under the hood, [airflow_dag_upload.py](../../scripts/airflow_dag_upload.py) uses the `kubectl cp` command to copy files from the local machine to the Airflow scheduler's container. Example usage:
```
python airflow_dag_upload.py my-dag.py /opt/airflow/dags/my-dag.py
```

## (Advanced) Terraform state management
Terraform manages the state of GCP resources in [tfstate](https://www.terraform.io/language/state) files locally by default.

4 changes: 2 additions & 2 deletions gcp/terraform/infra/kubernetes.tf
@@ -28,12 +28,12 @@ resource "google_container_cluster" "metaflow_kubernetes" {
enabled = true
resource_limits {
resource_type = "cpu"
minimum = 2
minimum = 1
maximum = 200
}
resource_limits {
resource_type = "memory"
minimum = 4
minimum = 2
maximum = 400
}
}