Work in Progress: This page is under development. Use the feedback button on the bottom right to help us improve it.

Apache Iceberg

Apache Iceberg connector for writing to Iceberg tables.

Connection Profile

  • catalog (required): Catalog configuration. One of:
    • REST catalog (type: "rest"):
      • url (string, required): Base URL for the REST catalog
      • warehouse (string, optional): The warehouse to connect to
      • token (string, optional): Authentication token
    • Glue catalog (type: "glue"):
      • glue_catalog_config (required):
        • region (string, required): AWS region
        • warehouse (string, optional): S3 warehouse path (e.g., s3://bucket/warehouse)
        • access_key_id (string, optional): AWS access key ID
        • secret_access_key (string, optional): AWS secret access key
        • endpoint (string, optional): Custom AWS endpoint

Connection Table

  • type (string, required): "sink"
  • sink_table_config (required): Sink table configuration
    • table_name (string, required): Table name
    • namespace (string, optional): Table namespace
    • location_path (string, optional): Data file location
    • rolling_policy (object, optional): File rolling policy
      • file_size_bytes (integer, optional): Files are rolled after reaching this size in bytes
      • interval_seconds (integer, optional): Seconds to wait before rolling to new file
      • inactivity_seconds (integer, optional): Seconds of inactivity before rolling to new file
    • file_naming (object, optional): Filename prefix/suffix and strategy
      • prefix (string, optional): Filename prefix
      • suffix (string, optional): Filename suffix (overrides the default .parquet)
      • strategy (string, optional): "serial" | "uuid" | "uuidV7" | "ulid"
    • multipart (object, optional): Multipart upload tuning for object stores
      • target_part_size_bytes (integer, optional): Target size per part (min 5242880)
      • max_parts (integer, optional): Max parts in multipart upload
    • partitioning (object, optional): Data partitioning configuration
      • time_pattern (string, optional): Pattern of the date string
      • fields (array, optional): Partition field configurations
        • name (string, required): Field to partition by
        • transform (string, optional): "identity" | "hour" | "year" | "month"
      • shuffle_by_partition (object, required): Partition shuffle settings
        • enabled (boolean, optional): Enable partition key shuffling
    • storage_options (object, optional): See the FileSystem connector docs for the full list of options

JSON Schema Reference

Connection Profile Schema

{
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "catalog": {
      "oneOf": [
        {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "type": {"type": "string", "const": "rest"},
            "url": {
              "type": "string",
              "description": "Base URL for the REST catalog",
              "examples": ["http://localhost:8001/iceberg"],
              "format": "uri"
            },
            "warehouse": {
              "type": "string",
              "description": "The warehouse to connect to",
              "examples": ["16ba210d70caae96ecb1f6e17afe6f3b_my-bucket"]
            },
            "token": {
              "type": "string",
              "format": "var-str",
              "description": "Authentication token"
            }
          },
          "required": ["type", "url"],
          "description": "REST catalog connector for Apache Iceberg."
        },
        {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "type": {"type": "string", "const": "glue"},
            "glue_catalog_config": {
              "type": "object",
              "properties": {
                "region": {
                  "type": "string",
                  "description": "AWS region"
                },
                "warehouse": {
                  "type": "string",
                  "description": "S3 warehouse path",
                  "examples": ["s3://my-bucket/warehouse"]
                },
                "access_key_id": {
                  "type": "string",
                  "description": "AWS access key ID"
                },
                "secret_access_key": {
                  "type": "string",
                  "description": "AWS secret access key"
                },
                "endpoint": {
                  "type": "string",
                  "description": "Custom AWS endpoint"
                }
              },
              "required": ["region"]
            }
          },
          "required": ["type", "glue_catalog_config"],
          "description": "AWS Glue catalog connector for Apache Iceberg."
        }
      ]
    }
  },
  "required": ["catalog"]
}

Connection Table Schema

{
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "type": {"type": "string", "const": "sink"},
    "sink_table_config": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "table_name": {
          "type": "string",
          "title": "Table Name",
          "description": "Table name"
        },
        "namespace": {
          "type": "string",
          "title": "Namespace",
          "description": "Table namespace"
        },
        "location_path": {
          "type": "string",
          "title": "Location Path",
          "description": "Data file location"
        },
        "storage_options": {
          "type": "object",
          "additionalProperties": {"type": "string"},
          "title": "Storage Options",
          "description": "See the FileSystem connector docs for the full list of options"
        },
        "rolling_policy": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "file_size_bytes": {
              "type": "integer",
              "format": "uint64",
              "minimum": 0,
              "title": "File Size",
              "description": "Files will be rolled after reaching this number of bytes"
            },
            "interval_seconds": {
              "type": "integer",
              "format": "uint64",
              "minimum": 1,
              "title": "Interval Seconds",
              "description": "Number of seconds to wait before rolling over to a new file"
            },
            "inactivity_seconds": {
              "type": "integer",
              "format": "uint64",
              "minimum": 1,
              "title": "Inactivity Seconds",
              "description": "Number of seconds of inactivity to wait before rolling over to a new file"
            }
          },
          "title": "File Rolling Policy",
          "description": "Rolling policy for file sinks (when & why to close a file and open a new one)."
        },
        "file_naming": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "prefix": {
              "type": "string",
              "title": "Filename Prefix",
              "description": "The prefix to use in the file name, e.g. prefix-<uuid>.parquet"
            },
            "suffix": {
              "type": "string",
              "title": "Filename Suffix",
              "description": "Overrides the default file suffix (.parquet); use with caution"
            },
            "strategy": {
              "type": "string",
              "enum": ["serial", "uuid", "uuidV7", "ulid"],
              "title": "Filename Strategy",
              "description": "Filename generation strategy."
            }
          },
          "title": "File Naming",
          "description": "Controls filename prefix/suffix and strategy."
        },
        "multipart": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "target_part_size_bytes": {
              "type": "integer",
              "format": "uint64",
              "minimum": 5242880,
              "title": "Target Part Size",
              "description": "Target size for each part of the multipart upload, in bytes"
            },
            "max_parts": {
              "type": "integer",
              "format": "uint64",
              "minimum": 1,
              "title": "Max Parts",
              "description": "Maximum number of parts to upload in a multipart upload"
            }
          },
          "title": "Multipart Upload Settings",
          "description": "Multipart-upload tuning for object stores that need it."
        },
        "partitioning": {
          "type": "object",
          "additionalProperties": false,
          "properties": {
            "time_pattern": {
              "type": "string",
              "title": "Time Partition Pattern",
              "description": "The pattern of the date string"
            },
            "fields": {
              "type": "array",
              "items": {
                "title": "Partition Field",
                "description": "Partition field configuration",
                "type": "object",
                "properties": {
                  "name": {
                    "title": "Field Name",
                    "description": "The field to partition by",
                    "type": "string"
                  },
                  "transform": {
                    "title": "Transform",
                    "description": "Transformation to apply (identity, hour, year, month)",
                    "type": "string",
                    "enum": ["identity", "hour", "year", "month"]
                  }
                },
                "additionalProperties": false,
                "required": ["name"]
              },
              "title": "Partition Fields",
              "description": "Fields to partition the data by with transformations"
            },
            "shuffle_by_partition": {
              "type": "object",
              "additionalProperties": false,
              "properties": {
                "enabled": {
                  "type": "boolean",
                  "title": "Enable partition shuffling",
                  "description": "If enabled, we will shuffle by the partition keys, which can reduce the number of files a sink produces; however this may cause backlog if data is skewed"
                }
              },
              "description": "Advanced tuning for hash shuffling of partition keys",
              "title": "Partition shuffle settings"
            }
          },
          "required": ["shuffle_by_partition"],
          "description": "Data-layout partitioning for sinks."
        }
      },
      "required": ["table_name"]
    }
  },
  "required": ["type", "sink_table_config"],
  "description": "Iceberg sink definition."
}