A practical introduction to Spark’s Column- part 1

A table in Spark SQL
scala> val got = Seq((1,"Bran"),(2,"Jon")).toDF("id","name")
got: org.apache.spark.sql.DataFrame = [id: int, name: string]
scala> got.show
+---+----+
| id|name|
+---+----+
| 1|Bran|
| 2| Jon|
+---+----+

To access the column from a dataset/dataframe:

scala>  import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.col
scala> got.select(col("id"))
res17: org.apache.spark.sql.DataFrame = [id: int]
scala> import org.apache.spark.sql.functions.column
import org.apache.spark.sql.functions.column
scala> got.select(column("id"))
res18: org.apache.spark.sql.DataFrame = [id: int]
scala> import spark.implicits.StringToColumn
import spark.implicits.StringToColumn
scala> got.select($"id")
res19: org.apache.spark.sql.DataFrame = [id: int]
scala> import spark.implicits.symbolToColumn
import spark.implicits.symbolToColumn
scala> got.select('id)
res20: org.apache.spark.sql.DataFrame = [id: int]
scala> got.select('id).show
+---+
| id|
+---+
| 1|
| 2|
+---+
scala> import scala.reflect.runtime.universe.reify
import scala.reflect.runtime.universe.reify
scala> println(reify(Seq((1,"Bran"),(2,"Jon")).toDF("id","name")))
Expr[org.apache.spark.sql.DataFrame]($iw.$line3$read.$iw.$iw.spark.implicits.localSeqToDatasetHolder(Seq.apply(Tuple2.apply(1, "Bran"), Tuple2.apply(2, "Jon")))($iw.$line3$read.$iw.$iw.spark.implicits.newProductEncoder(Predef.this.implicitly)).toDF("id", "name"))
scala> val name = $"name".as[String]
name: org.apache.spark.sql.TypedColumn[Any,String] = name
scala> val name = $"name"
name: org.apache.spark.sql.ColumnName = name
!==, +, <=, >, apply, asc_nulls_last, bitwiseXOR, desc_nulls_first, eqNullSafe, expr, gt, isNotNull, like, multiply, otherwise, startsWith, unary_-, %, -, <=>, >=, as, between, cast, desc_nulls_last, equalTo, geq, hashCode, isNull, lt, name, over, substr, when, &&, /, =!=, alias, asc, bitwiseAND, contains, divide, equals, getField, isInCollection, isin, minus, notEqual, plus, toString, ||, *, <, ===, and, asc_nulls_first, bitwiseOR, desc, endsWith, explain, getItem, isNaN, leq, mod, or, rlike, unary_!
scala>  val gotData = Seq((101,"Bran",10,"Stark"),(221,"Jon",16,null),(11,"Ned",50,"Stark"),(21,"Tyrion",40,"Lanister")).toDF("id","name","age","house")
gotData: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> gotData.show
+---+------+---+--------+
| id| name|age| house|
+---+------+---+--------+
|101| Bran| 10| Stark|
|221| Jon| 16| null|
| 11| Ned| 50| Stark|
| 21|Tyrion| 40|Lanister|
+---+------+---+--------+

1) +

scala> gotData.withColumn("char_current_age", col("age").+(8)).show()scala> gotData.withColumn("char_current_age", col("age") +8 ).show()+---+------+---+--------+----------------+
| id| name|age| house|char_current_age|
+---+------+---+--------+----------------+
|101| Bran| 10| Stark| 18|
|221| Jon| 16| null| 24|
| 11| Ned| 50| Stark| 58|
| 21|Tyrion| 40|Lanister| 48|
+---+------+---+--------+----------------+
scala> gotData.select( gotData("age") + gotData("id") ).show
+----------+
|(age + id)|
+----------+
| 111|
| 237|
| 61|
| 61|
+----------+
scala> gotData.select( gotData("age") plus gotData("id") )

2) -

scala> gotData.withColumn("older_than_bran", col("age") - 10  ).show()
+---+------+---+--------+---------------+
| id| name|age| house|older_than_bran|
+---+------+---+--------+---------------+
|101| Bran| 10| Stark| 0|
|221| Jon| 16| null| 6|
| 11| Ned| 50| Stark| 40|
| 21|Tyrion| 40|Lanister| 30|
+---+------+---+--------+---------------+
scala> gotData.withColumn("older_than_bran", col("age") minus 10 )
scala> gotData.withColumn("multiply_10", col("age") *  10)
scala> gotData.withColumn("multiply_10", col("age") multiply 10)
scala> gotData.withColumn("divide_10", col("age") / 10)
scala> gotData.withColumn("divide_10", col("age") divide 10)
scala> gotData.withColumn("mod_10", col("age") mod 10)
scala> gotData.withColumn("mod_10", col("age") % 10)

4) isin

scala> val teenage = (15 to 25)
teenage: scala.collection.immutable.Range.Inclusive = Range(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)
scala> gotData.withColumn("teenage",col("age").isin(teenage:_*)).show()
+---+------+---+--------+-------+
| id| name|age| house|teenage|
+---+------+---+--------+-------+
|101| Bran| 10| Stark| false|
|221| Jon| 16| null| true|
| 11| Ned| 50| Stark| false|
| 21|Tyrion| 40|Lanister| false|
+---+------+---+--------+-------+
scala> gotData.withColumn("teenage",col("age").isin(15 to 25:_*))
scala> gotData.withColumn("is_Stark",col("house").isin("Stark"))
scala>   gotData.withColumn("teenage",col("age").isInCollection(15 to 25))

5) like

scala> gotData.withColumn("starts with B",col("name").like("B%")).show()
+---+------+---+--------+-------------+
| id| name|age| house|starts with B|
+---+------+---+--------+-------------+
|101| Bran| 10| Stark| true|
|221| Jon| 16| null| false|
| 11| Ned| 50| Stark| false|
| 21|Tyrion| 40|Lanister| false|
+---+------+---+--------+-------------+
scala> gotData.withColumn("has O ",col("name").like("%o%")).show()
+---+------+---+--------+------+
| id| name|age| house|has O |
+---+------+---+--------+------+
|101| Bran| 10| Stark| false|
|221| Jon| 16| null| true|
| 11| Ned| 50| Stark| false|
| 21|Tyrion| 40|Lanister| true|
+---+------+---+--------+------+
scala> gotData.withColumn("has O ",col("name").rlike("^(Bran|Jon)")) .show()
+---+------+---+--------+------+
| id| name|age| house|has O |
+---+------+---+--------+------+
|101| Bran| 10| Stark| true|
|221| Jon| 16| null| true|
| 11| Ned| 50| Stark| false|
| 21|Tyrion| 40|Lanister| false|
+---+------+---+--------+------+
gotData.withColumn("has O ",!col("name").like("%o%"))
gotData.withColumn("has O ",!col("name").rlike("^(Bran|Jon)"))

6) isNull and isNotNull

scala>gotData.withColumn("house_unknown",col("house").isNull).show()
+---+------+---+--------+-------------+
| id| name|age| house|house_unknown|
+---+------+---+--------+-------------+
|101| Bran| 10| Stark| false|
|221| Jon| 16| null| true|
| 11| Ned| 50| Stark| false|
| 21|Tyrion| 40|Lanister| false|
+---+------+---+--------+-------------+
scala>gotData.withColumn("houseNotnull",col("house").isNotNull)
.show()

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store