Optimize Parquet file size in Spark and ingest into Azure Data Explorer using Azure Synapse Spark

Optimize Parquet file size into 1 GB chunks for analytics


Synapse Spark

from pyspark.sql import SparkSession
# Synapse Spark notebook script: read source data from Blob storage, rewrite it
# as Parquet with ~1 GB target file size, then ingest the Parquet data into an
# Azure Data Explorer (Kusto) table.
#
# `spark` and `mssparkutils` are not defined here; the script relies on the
# Synapse notebook runtime providing them.
#
# NOTE(review): several string literals and calls in the original text were
# lost (empty format strings, empty config keys, truncated read/write calls).
# Values marked "reconstructed" below are best-effort restorations and must be
# confirmed before running against real data.

# --- Azure storage access info ---
blob_account_name = 'xxxxxxx'  # replace with your blob name
blob_container_name = 'xxxxxxxx'  # replace with your container name
blob_relative_path = ''  # replace with your relative folder path
linked_service_name = 'BenchMarkLogs'  # replace with your linked service name

# The SAS token is read from a secret store. Alternative via linked service:
#   mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)
blob_sas_token = mssparkutils.credentials.getSecret("iamkeys", "benchmarklogs")

# --- Allow Spark to access the Blob container remotely ---
# Reconstructed: the original format strings had no placeholders, so the
# `%` formatting with a tuple would raise TypeError.
wasb_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
    blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set(
    'fs.azure.sas.%s.%s.blob.core.windows.net' % (
        blob_container_name, blob_account_name),
    blob_sas_token)
print('Remote blob path: ' + wasb_path)

# --- Read the source data ---
# Reconstructed: the original read call was missing entirely.
source_format = 'json'  # TODO confirm the actual source file format
df = spark.read.format(source_format).load(wasb_path)

# --- Target ~1 GB (1073741824 bytes) files/partitions ---
# Reconstructed config keys (the originals were empty strings) -- presumably
# the Synapse optimize-write settings; verify against the Synapse docs.
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.microsoft.delta.optimizeWrite.binSize", "1073741824")
spark.conf.set("spark.sql.files.maxPartitionBytes", "1073741824")

# --- Write the data as Parquet, then read it back for ingestion ---
# Reconstructed: only the "abfss://" prefix of the path survived.
parquet_path = "abfss://<container>@<account>.dfs.core.windows.net/<folder>"  # replace
# NOTE(review): the write step was absent from the original text; it is
# restored here because `df` was otherwise unused. The default save mode is
# kept so an existing non-empty target is not silently overwritten.
df.write.parquet(parquet_path)
df1 = spark.read.parquet(parquet_path)

# --- Ingest into Azure Data Explorer ---
# The original listed two writes back to back (AAD application credentials and
# Synapse linked service). They target the same table, so running both would
# append the data twice; pick one.
use_linked_service = False

if use_linked_service:
    # Authenticate through a Synapse linked service.
    # Reconstructed format name -- confirm against the connector docs.
    (
        df1.write
        .format("com.microsoft.kusto.spark.synapse.datasource")
        .option("spark.synapse.linkedService", "linkedsvcname")
        .option("kustoDatabase", "Benchmark")
        .option("kustoTable", "logspark")
        .option("tableCreateOptions", "CreateIfNotExist")
        .mode("Append")
        .save()
    )
else:
    # Authenticate with an AAD application whose id/secret/tenant are stored
    # as secrets.
    kustoappid = mssparkutils.credentials.getSecret("iamkeys", "kustoappid")
    kustosecret = mssparkutils.credentials.getSecret("iamkeys", "kustosecret")
    kustotenant = mssparkutils.credentials.getSecret("iamkeys", "kustotenant")
    # Reconstructed format name -- confirm against the connector docs.
    (
        df1.write
        .format("com.microsoft.kusto.spark.datasource")
        .option("kustoCluster", "clustername.region")
        .option("kustoDatabase", "Benchmark")
        .option("kustoTable", "logspark")
        .option("kustoAadAppId", kustoappid)
        .option("kustoAadAppSecret", kustosecret)
        .option("kustoAadAuthorityID", kustotenant)
        .option("tableCreateOptions", "CreateIfNotExist")
        .mode("Append")
        .save()
    )



