Demystifying the Data Flow: A Guide to Scenario-Based Azure Data Engineering Interviews
1. You have a PySpark DataFrame named employee_data with the following columns: employee_id, department, salary, and hire_date.
You need to find the highest salary in each department and calculate each employee's salary as a percentage of the department's highest salary.
Data:
+-----------+--------------+------+----------+
|employee_id| department|salary| hire_date|
+-----------+--------------+------+----------+
| 101| Sales| 5125|02-02-2013|
| 102| Sales| 5529|31-08-2013|
| 103| Finance| 5125|16-06-2014|
| 104| Logistics| 5329|10-07-2014|
| 105|Human Resource| 5529|16-07-2014|
| 106| Sales| 5423|25-07-2014|
| 107| Sales| 5125|31-07-2014|
| 108| Finance| 5516|11-08-2014|
| 109| Finance| 5564|11-08-2014|
| 110| Logistics| 5446|12-08-2014|
+-----------+--------------+------+----------+
from pyspark.sql import functions as F
from pyspark.sql.functions import col

df = employee_data

# Calculate the maximum salary per department
df1 = df.select("department", "salary").groupBy(col("department")).max("salary").withColumnRenamed('max(salary)', 'max_salary')

# Join the per-department maximums back onto the original DataFrame
df2 = df.alias("E").join(df1.alias("M"), col('E.department') == col('M.department'), 'left') \
    .select('E.employee_id', col('E.department').alias("dept"), 'E.salary', 'E.hire_date', 'M.max_salary')

# Calculate each salary as a percentage of the department maximum
df3 = df2.withColumn('salary_percentage', F.col('salary') / F.col('max_salary') * 100)
df3.select("employee_id", "dept", "salary", "max_salary", "salary_percentage").show()
#Output
+-----------+--------------+------+----------+-----------------+
|employee_id| dept|salary|max_salary|salary_percentage|
+-----------+--------------+------+----------+-----------------+
| 101| Sales| 5125| 5529|92.69307288840658|
| 102| Sales| 5529| 5529| 100.0|
| 103| Finance| 5125| 5564| 92.1099928109274|
| 104| Logistics| 5329| 5446|97.85163422695557|
| 105|Human Resource| 5529| 5529| 100.0|
| 106| Sales| 5423| 5529|98.08283595586906|
| 107| Sales| 5125| 5529|92.69307288840658|
| 108| Finance| 5516| 5564| 99.137311286844|
| 109| Finance| 5564| 5564| 100.0|
| 110| Logistics| 5446| 5446| 100.0|
+-----------+--------------+------+----------+-----------------+
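The same result can be computed more concisely with a window function, avoiding the explicit self-join. A minimal sketch, assuming the same employee_data DataFrame:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Partition by department so max() is computed within each department
dept_window = Window.partitionBy("department")

result = (
    employee_data
    .withColumn("max_salary", F.max("salary").over(dept_window))
    .withColumn("salary_percentage", F.col("salary") / F.col("max_salary") * 100)
)
result.select("employee_id", "department", "salary", "max_salary", "salary_percentage").show()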
2. jsonString = '{"id":"2","Name":"XYZ","City":"Chn"}'
data = [(1, '{"id":"1","Name":"ABC","City":"Chn"}'), (2, jsonString)]
# Expected output
Tuple 1:
{"id":"1","Name":"ABC","City":"Chn"}
Tuple 2:
{"id":"2","Name":"AB","City":"Chn"}
import json

def process_data(data):
    """Processes a list of tuples, extracts JSON data, and corrects a potential error.

    Args:
        data: A list of tuples containing integers and JSON strings.

    Returns:
        A list of tuples where the second element in each tuple is the parsed JSON data.
    """
    processed_data = []
    for index, (key, value) in enumerate(data):
        try:
            # Attempt to parse JSON with error handling
            parsed_data = json.loads(value)
            # Correct the value if "AB" was intended instead of "XYZ"
            if parsed_data.get("Name") == "XYZ":
                parsed_data["Name"] = "AB"
            processed_data.append((key, parsed_data))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON data for tuple {index + 1}: {e}")
    return processed_data

processed_data = process_data(data)
for key, value in processed_data:
    print(f"Tuple {key}:")
    print(json.dumps(value, indent=4))
# Output
Tuple 1:
{
"id": "1",
"Name": "ABC",
"City": "Chn"
}
Tuple 2:
{
"id": "2",
"Name": "AB",
"City": "Chn"
}
3. Given a Country table with the values below, write a query to produce every unique pair of countries.
Country
Ind
SL
Pak
Ban
Output Expected:
Ind-SL
Ind-Pak
Ind-Ban
SL-Pak
SL-Ban
Pak-Ban
create table data (sno int, country varchar(20));
insert into data (sno, country) values (1,'Ind'),(2,'SL'),(3,'Pak'),(4,'Ban');
-- Query: self-join on sno so each unordered pair appears exactly once
select concat(a.country, '-', b.country) as pair
from data as a
join data as b on a.sno < b.sno
order by a.sno, b.sno;
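Since these interviews are PySpark-focused, the same pairing logic can also be expressed as a DataFrame self-join. A minimal sketch, assuming a SparkSession named spark:

from pyspark.sql import functions as F

countries = spark.createDataFrame(
    [(1, 'Ind'), (2, 'SL'), (3, 'Pak'), (4, 'Ban')], ['sno', 'country'])

# Self-join on a.sno < b.sno to emit each unordered pair exactly once
pairs = (
    countries.alias('a')
    .join(countries.alias('b'), F.col('a.sno') < F.col('b.sno'))
    .select(F.concat_ws('-', 'a.country', 'b.country').alias('pair'))
    .orderBy('a.sno', 'b.sno')
)
pairs.show()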
4. Write a query to get the count of employees under each manager from the employee table (columns: employee_id, first_name, manager_id).

with temp as (
    select a.employee_id, a.first_name, b.first_name as manager_name
    from employee as a
    join employee as b on a.manager_id = b.employee_id
)
select manager_name, count(*) as employee_count
from temp
group by manager_name;
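A PySpark equivalent of the same self-join, assuming an employee DataFrame with the employee_id, first_name, and manager_id columns implied by the query above:

from pyspark.sql import functions as F

# Join each employee row to its manager's row, then count per manager
counts = (
    employee.alias('e')
    .join(employee.alias('m'), F.col('e.manager_id') == F.col('m.employee_id'))
    .groupBy(F.col('m.first_name').alias('manager_name'))
    .count()
)
counts.show()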
5. Table A (id column): 1, 1, 1, null, null
Table B (id column): 1, 1, null, null, null
For the above two tables, what is the output when they are joined on id with a left, right, and inner join?

Answer: a null never equals a null in a join condition, so only the 1s match. Each of A's three 1s matches each of B's two 1s, giving 6 rows for the inner join. The left join additionally keeps A's two unmatched nulls (8 rows), and the right join keeps B's three unmatched nulls (9 rows).
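A quick PySpark sketch to verify these counts, assuming a SparkSession named spark:

from pyspark.sql.types import StructType, StructField, IntegerType

schema = StructType([StructField('id', IntegerType(), True)])
a = spark.createDataFrame([(1,), (1,), (1,), (None,), (None,)], schema)
b = spark.createDataFrame([(1,), (1,), (None,), (None,), (None,)], schema)

# Nulls never satisfy the equality condition, so only the 1s match
print(a.join(b, 'id', 'inner').count())  # 6
print(a.join(b, 'id', 'left').count())   # 8
print(a.join(b, 'id', 'right').count())  # 9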
6. Given the list [1, 3, 2, 4, 5, 6], write code to print the highest number.
a=[1, 3, 2, 4, 5, 6]
highestnumber = max(a)
print(highestnumber)
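Interviewers often follow up by asking for the same result without the built-in max(); a minimal loop-based sketch:

a = [1, 3, 2, 4, 5, 6]

# Track the largest value seen so far
highest = a[0]
for n in a[1:]:
    if n > highest:
        highest = n
print(highest)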
7. Create a DataFrame for the data below.
product qtr1 qtr2 qtr3 qtr4
A       5000 4000 6000 1000
B       8000 4000 6000 2000
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Quarterly values must be integers to match the IntegerType schema
data = [('A', 5000, 4000, 6000, 1000), ('B', 8000, 4000, 6000, 2000)]
schema = StructType([
    StructField('product', StringType(), True),
    StructField('qtr1', IntegerType(), True),
    StructField('qtr2', IntegerType(), True),
    StructField('qtr3', IntegerType(), True),
    StructField('qtr4', IntegerType(), True),
])
df = spark.createDataFrame(data, schema=schema)
df.show()
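When an explicit schema is not required, the same DataFrame can be built more compactly by passing only column names and letting Spark infer the types (note that Python ints are inferred as LongType rather than IntegerType):

df = spark.createDataFrame(
    [('A', 5000, 4000, 6000, 1000), ('B', 8000, 4000, 6000, 2000)],
    ['product', 'qtr1', 'qtr2', 'qtr3', 'qtr4'])
df.show()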