Work in Progress: This page is under development. Use the feedback button on the bottom right to help us improve it.

Delta Lake

Delta Lake connector for writing to Delta Lake tables.

Connection Profile

The Delta Lake connector does not require a connection profile. Storage credentials are configured directly in the table configuration via storage_options.

Connection Table

  • path (string, required): URI of the Delta Lake table (supports s3://, gs://, az://, and local paths)
  • storage_options (object, optional): Storage provider credentials and options
  • rolling_policy (object, optional): File rolling policy
    • file_size_bytes (integer, optional): Files are rolled after reaching this size in bytes
    • interval_seconds (integer, optional): Seconds to wait before rolling over to a new file
    • inactivity_seconds (integer, optional): Seconds of inactivity before rolling over to a new file
  • file_naming (object, optional): Filename prefix/suffix and strategy
    • prefix (string, optional): Filename prefix
    • suffix (string, optional): Filename suffix (overrides the default .parquet)
    • strategy (string, optional): "serial" | "uuid" | "uuidV7" | "ulid"
  • multipart (object, optional): Multipart upload tuning for object stores
    • target_part_size_bytes (integer, optional): Target size per part in bytes (minimum 5242880, i.e. 5 MiB)
    • max_parts (integer, optional): Max parts in multipart upload
  • partitioning (object, optional): Data partitioning configuration
    • time_pattern (string, optional): Pattern of date string (e.g., "year=%Y/month=%m/day=%d")
    • fields (array, optional): Partition field configurations
      • name (string, required): Field to partition by
      • transform (string, optional): "identity" | "hour" | "year" | "month"
    • shuffle_by_partition (object, required): Partition shuffle settings
      • enabled (boolean, optional): Enable partition key shuffling
  • type (string, required): "sink"

Storage Options

Common storage options by provider:

AWS S3:

  • aws_region - AWS region
  • aws_access_key_id - Access key ID
  • aws_secret_access_key - Secret access key

Google Cloud Storage:

  • gcs_project - GCP project ID

Azure Blob Storage:

  • azure_account - Storage account name
  • azure_key - Storage account key

JSON Schema Reference

Connection Profile Schema

{}

Connection Table Schema

{
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "path": {
      "type": "string",
      "title": "Path",
      "description": "URI of the Delta Lake table to write to",
      "examples": ["s3://my-bucket/delta-tables/events/"]
    },
    "storage_options": {
      "type": "object",
      "additionalProperties": {"type": "string"},
      "title": "Storage Options",
      "description": "Storage provider options (e.g., AWS credentials, region settings)"
    },
    "rolling_policy": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "file_size_bytes": {
          "type": "integer",
          "format": "uint64",
          "minimum": 0,
          "title": "File Size",
          "description": "Files will be rolled after reaching this number of bytes"
        },
        "interval_seconds": {
          "type": "integer",
          "format": "uint64",
          "minimum": 1,
          "title": "Interval Seconds",
          "description": "Number of seconds to wait before rolling over to a new file"
        },
        "inactivity_seconds": {
          "type": "integer",
          "format": "uint64",
          "minimum": 1,
          "title": "Inactivity Seconds",
          "description": "Number of seconds of inactivity to wait before rolling over to a new file"
        }
      },
      "title": "File Rolling Policy",
      "description": "Rolling policy for file sinks (when & why to close a file and open a new one)."
    },
    "file_naming": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "prefix": {
          "type": "string",
          "title": "Filename Prefix",
          "description": "The prefix to use in the file name, e.g. prefix-<uuid>.parquet"
        },
        "suffix": {
          "type": "string",
          "title": "Filename Suffix",
          "description": "This will override the default file suffix (i.e. .parquet); use with caution"
        },
        "strategy": {
          "type": "string",
          "enum": ["serial", "uuid", "uuidV7", "ulid"],
          "title": "Filename Strategy",
          "description": "Filename generation strategy."
        }
      },
      "title": "File Naming",
      "description": "Controls filename prefix/suffix and strategy."
    },
    "multipart": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "target_part_size_bytes": {
          "type": "integer",
          "format": "uint64",
          "minimum": 5242880,
          "title": "Target Part Size",
          "description": "Target size for each part of the multipart upload, in bytes"
        },
        "max_parts": {
          "type": "integer",
          "format": "uint64",
          "minimum": 1,
          "title": "Max Parts",
          "description": "Maximum number of parts to upload in a multipart upload"
        }
      },
      "title": "Multipart Upload Settings",
      "description": "Multipart upload tuning for object stores that need it."
    },
    "partitioning": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "time_pattern": {
          "type": "string",
          "title": "Time Partition Pattern",
          "description": "The pattern of the date string (e.g., year=%Y/month=%m/day=%d)"
        },
        "fields": {
          "type": "array",
          "items": {
            "title": "Partition Field",
            "description": "Partition field configuration",
            "type": "object",
            "properties": {
              "name": {
                "title": "Field Name",
                "description": "The field to partition by",
                "type": "string"
              },
              "transform": {
                "title": "Transform",
                "description": "Transformation to apply (identity, hour, year, month)",
                "type": "string",
                "enum": ["identity", "hour", "year", "month"]
              }
            },
            "additionalProperties": false,
            "required": ["name"]
          },
          "title": "Partition Fields",
          "description": "Fields to partition the data by with transformations"
        },
        "shuffle_by_partition": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "enabled": {
              "type": "boolean",
              "title": "Enable partition shuffling",
              "description": "If enabled, we will shuffle by the partition keys, which can reduce the number of files a sink produces; however this may cause backlog if data is skewed"
            }
          },
          "description": "Advanced tuning for hash shuffling of partition keys",
          "title": "Partition shuffle settings"
        }
      },
      "required": ["shuffle_by_partition"],
      "description": "Data layout partitioning for sinks."
    },
    "type": {"type": "string", "const": "sink"}
  },
  "required": ["type", "path"],
  "description": "Delta Lake sink definition."
}