import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Small Spark job that demonstrates filling in missing school names.
 *
 * It builds an in-memory DataFrame with the columns `id`, `time` and `school`,
 * adds a `filled_school` column in which null or empty `school` values are
 * replaced by a placeholder, and prints the result.
 */
object DataCompletion {

  /** Placeholder written to `filled_school` when `school` is null or empty. */
  private val UnknownSchool = "Unknown School"

  /**
   * Entry point: creates the Spark session, runs the demo and stops the session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val baseBuilder = SparkSession.builder().appName("DataCompletion")
    // Keep an externally supplied master (e.g. from spark-submit) and only fall
    // back to local mode when none is configured.
    // NOTE(review): assumes the launcher exposes `spark.master` as a JVM system
    // property; confirm for your deployment mode.
    val spark =
      if (sys.props.contains("spark.master")) baseBuilder.getOrCreate()
      else baseBuilder.master("local[*]").getOrCreate()

    try {
      // Brings `toDF` into scope for local Scala collections.
      import spark.implicits._

      val data = Seq(
        ("001", "2022-01-01", "Harvard"),
        ("002", "2022-01-02", null),
        ("003", "2022-01-03", "Stanford"),
        ("004", "2022-01-04", ""),
        ("005", "2022-01-05", "MIT"),
        ("006", "2022-01-06", null)
      )
      val inputDf = data.toDF("id", "time", "school")

      // Add a new column that fills in the missing school names.
      val updatedDf = withFilledSchool(inputDf)
      updatedDf.show(false)
    } finally {
      // Always release the session, even if the job fails.
      spark.stop()
    }
  }

  /**
   * Adds a `filled_school` column to `df`.
   *
   * Built-in column functions are used instead of a UDF so the expression
   * remains visible to Spark's query optimizer.
   *
   * @param df DataFrame containing a string column named `school`
   * @return `df` plus `filled_school`, which equals `school` unless `school`
   *         is null or the empty string, in which case it is "Unknown School"
   */
  private def withFilledSchool(df: DataFrame): DataFrame = {
    val school = col("school")
    val isMissing = school.isNull || school === ""
    df.withColumn("filled_school", when(isMissing, lit(UnknownSchool)).otherwise(school))
  }
}