Demystifying the Data Flow: A Guide to Scenario-Based Azure Data Engineering Interviews
1. You have a PySpark DataFrame named employee_data with the following columns: employee_id, department, salary, and hire_date.
You need to find the highest salary in each department and calculate each employee's salary as a percentage of the department's highest salary.
Data:
+-----------+--------------+------+----------+
|employee_id| department|salary| hire_date|
+-----------+--------------+------+----------+
| 101| Sales| 5125|02-02-2013|
| 102| Sales| 5529|31-08-2013|
| 103| Finance| 5125|16-06-2014|
| 104| Logistics| 5329|10-07-2014|
| 105|Human Resource| 5529|16-07-2014|
| 106| Sales| 5423|25-07-2014|
| 107| Sales| 5125|31-07-2014|
| 108| Finance| 5516|11-08-2014|
| 109| Finance| 5564|11-08-2014|
| 110| Logistics| 5446|12-08-2014|
+-----------+--------------+------+----------+
from pyspark.sql import functions as F
from pyspark.sql.functions import col

df = employee_data

# Calculate the maximum salary per department
df1 = df.select("department", "salary").groupBy(col("department")).max("salary").withColumnRenamed('max(salary)', 'max_salary')

# Join the per-department maximums back onto the original DataFrame
df2 = df.alias("E").join(df1.alias("M"), col('E.department') == col('M.department'), 'left') \
    .select('E.employee_id', col('E.department').alias("dept"), 'E.salary', 'E.hire_date', 'M.max_salary')

# Calculate each salary as a percentage of the department maximum
df3 = df2.withColumn('salary_percentage', F.col('salary') / F.col('max_salary') * 100)
df3.select("employee_id", "dept", "salary", "max_salary", "salary_percentage").show()
#Output
+-----------+--------------+------+----------+-----------------+
|employee_id| dept|salary|max_salary|salary_percentage|
+-----------+--------------+------+----------+-----------------+
| 101| Sales| 5125| 5529|92.69307288840658|
| 102| Sales| 5529| 5529| 100.0|
| 103| Finance| 5125| 5564| 92.1099928109274|
| 104| Logistics| 5329| 5446|97.85163422695557|
| 105|Human Resource| 5529| 5529| 100.0|
| 106| Sales| 5423| 5529|98.08283595586906|
| 107| Sales| 5125| 5529|92.69307288840658|
| 108| Finance| 5516| 5564| 99.137311286844|
| 109| Finance| 5564| 5564| 100.0|
| 110| Logistics| 5446| 5446| 100.0|
+-----------+--------------+------+----------+-----------------+
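The same result can be computed more concisely with a window function, avoiding the explicit self-join. A minimal sketch, assuming the same employee_data DataFrame:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Partition by department so max() is computed within each department
dept_window = Window.partitionBy("department")

result = (
    employee_data
    .withColumn("max_salary", F.max("salary").over(dept_window))
    .withColumn("salary_percentage", F.col("salary") / F.col("max_salary") * 100)
)
result.select("employee_id", "department", "salary", "max_salary", "salary_percentage").show()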
2. jsonString = '{"id":"2","Name":"XYZ","City":"Chn"}'
data = [(1, '{"id":"1","Name":"ABC","City":"Chn"}'), (2, jsonString)]
# Expected output
Tuple 1:
{"id":"1","Name":"ABC","City":"Chn"}
Tuple 2:
{"id":"2","Name":"AB","City":"Chn"}
import json

def process_data(data):
    """Processes a list of tuples, extracts JSON data, and corrects a potential error.

    Args:
        data: A list of tuples containing integers and JSON strings.

    Returns:
        A list of tuples where the second element in each tuple is the parsed JSON data.
    """
    processed_data = []
    for index, (key, value) in enumerate(data):
        try:
            # Attempt to parse JSON with error handling
            parsed_data = json.loads(value)
            # Correct the value if "AB" was intended instead of "XYZ"
            if parsed_data.get("Name") == "XYZ":
                parsed_data["Name"] = "AB"
            processed_data.append((key, parsed_data))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON data for tuple {index + 1}: {e}")
    return processed_data

processed_data = process_data(data)
for key, value in processed_data:
    print(f"Tuple {key}:")
    print(json.dumps(value, indent=4))
# Output
Tuple 1:
{
"id": "1",
"Name": "ABC",
"City": "Chn"
}
Tuple 2:
{
"id": "2",
"Name": "AB",
"City": "Chn"
}
3. Given a Country table with the values below, write a query to produce every unique pair of countries.
Country
Ind
SL
Pak
Ban
Output Expected:
Ind-SL
Ind-Pak
Ind-Ban
SL-Pak
SL-Ban
Pak-Ban
create table data (sno int, country varchar(20));
insert into data (sno, country) values (1,'Ind'),(2,'SL'),(3,'Pak'),(4,'Ban');
-- Query: self-join on sno so each unordered pair appears exactly once
select concat(a.country, '-', b.country) as pair
from data as a
join data as b on a.sno < b.sno
order by a.sno, b.sno;
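Since these interviews are PySpark-focused, the same pairing logic can also be expressed as a DataFrame self-join. A minimal sketch, assuming a SparkSession named spark:

from pyspark.sql import functions as F

countries = spark.createDataFrame(
    [(1, 'Ind'), (2, 'SL'), (3, 'Pak'), (4, 'Ban')], ['sno', 'country'])

# Self-join on a.sno < b.sno to emit each unordered pair exactly once
pairs = (
    countries.alias('a')
    .join(countries.alias('b'), F.col('a.sno') < F.col('b.sno'))
    .select(F.concat_ws('-', 'a.country', 'b.country').alias('pair'))
    .orderBy('a.sno', 'b.sno')
)
pairs.show()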
4. Write a query to get the count of employees under each manager from the employee table (columns: employee_id, first_name, manager_id).

with temp as (
    select a.employee_id, a.first_name, b.first_name as manager_name
    from employee as a
    join employee as b on a.manager_id = b.employee_id
)
select manager_name, count(*) as employee_count
from temp
group by manager_name;
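A PySpark equivalent of the same self-join, assuming an employee DataFrame with the employee_id, first_name, and manager_id columns implied by the query above:

from pyspark.sql import functions as F

# Join each employee row to its manager's row, then count per manager
counts = (
    employee.alias('e')
    .join(employee.alias('m'), F.col('e.manager_id') == F.col('m.employee_id'))
    .groupBy(F.col('m.first_name').alias('manager_name'))
    .count()
)
counts.show()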
5. Table A (id column): 1, 1, 1, null, null
Table B (id column): 1, 1, null, null, null
For the above two tables, what is the output when they are joined on id with a left, right, and inner join?

Answer: a null never equals a null in a join condition, so only the 1s match. Each of A's three 1s matches each of B's two 1s, giving 6 rows for the inner join. The left join additionally keeps A's two unmatched nulls (8 rows), and the right join keeps B's three unmatched nulls (9 rows).
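A quick PySpark sketch to verify these counts, assuming a SparkSession named spark:

from pyspark.sql.types import StructType, StructField, IntegerType

schema = StructType([StructField('id', IntegerType(), True)])
a = spark.createDataFrame([(1,), (1,), (1,), (None,), (None,)], schema)
b = spark.createDataFrame([(1,), (1,), (None,), (None,), (None,)], schema)

# Nulls never satisfy the equality condition, so only the 1s match
print(a.join(b, 'id', 'inner').count())  # 6
print(a.join(b, 'id', 'left').count())   # 8
print(a.join(b, 'id', 'right').count())  # 9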
6. Given the list [1, 3, 2, 4, 5, 6], write code to print the highest number.
a=[1, 3, 2, 4, 5, 6]
highestnumber = max(a)
print(highestnumber)
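Interviewers often follow up by asking for the same result without the built-in max(); a minimal loop-based sketch:

a = [1, 3, 2, 4, 5, 6]

# Track the largest value seen so far
highest = a[0]
for n in a[1:]:
    if n > highest:
        highest = n
print(highest)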
7. Create a DataFrame for the data below.
product qtr1 qtr2 qtr3 qtr4
A       5000 4000 6000 1000
B       8000 4000 6000 2000
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Quarterly values must be integers to match the IntegerType schema
data = [('A', 5000, 4000, 6000, 1000), ('B', 8000, 4000, 6000, 2000)]
schema = StructType([
    StructField('product', StringType(), True),
    StructField('qtr1', IntegerType(), True),
    StructField('qtr2', IntegerType(), True),
    StructField('qtr3', IntegerType(), True),
    StructField('qtr4', IntegerType(), True),
])
df = spark.createDataFrame(data, schema=schema)
df.show()
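When an explicit schema is not required, the same DataFrame can be built more compactly by passing only column names and letting Spark infer the types (note that Python ints are inferred as LongType rather than IntegerType):

df = spark.createDataFrame(
    [('A', 5000, 4000, 6000, 1000), ('B', 8000, 4000, 6000, 2000)],
    ['product', 'qtr1', 'qtr2', 'qtr3', 'qtr4'])
df.show()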