Spark_属性数据补全代码

发布时间:2023年12月28日
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Small Spark example that completes a missing attribute: it builds an
 * in-memory DataFrame of (id, time, school) rows and adds a `filled_school`
 * column in which null or empty school names are replaced by a placeholder.
 */
object DataCompletion {

  /** Value written to `filled_school` when `school` is null or an empty string. */
  private val UnknownSchool = "Unknown School"

  /**
   * Entry point: creates the session, builds the sample data, fills the
   * missing school names and prints the result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // The session must exist before `spark.implicits._` can be imported.
    // The master falls back to local mode only when the `spark.master` system
    // property is not set, so a launcher-provided master is not overridden
    // (assumes the launcher exposes it as a system property — TODO confirm).
    val spark = SparkSession.builder()
      .appName("DataCompletion")
      .master(sys.props.getOrElse("spark.master", "local[*]"))
      .getOrCreate()

    try {
      import spark.implicits._

      // Sample rows; `school` is deliberately null or "" in some of them.
      val inputDf = Seq(
        ("001", "2022-01-01", "Harvard"),
        ("002", "2022-01-02", null),
        ("003", "2022-01-03", "Stanford"),
        ("004", "2022-01-04", ""),
        ("005", "2022-01-05", "MIT"),
        ("006", "2022-01-06", null)
      ).toDF("id", "time", "school")

      // Add a new column holding the school name with missing values filled in.
      val updatedDf = fillMissingSchool(inputDf)

      updatedDf.show(false)
    } finally {
      // Release the session's resources even if the job fails.
      spark.stop()
    }
  }

  /**
   * Adds a `filled_school` column to `df`.
   *
   * Uses built-in column functions rather than a UDF so the expression stays
   * visible to Spark's optimizer and null handling is explicit.
   *
   * @param df input DataFrame containing a string column named `school`
   * @return `df` plus `filled_school`, which equals `school` unless `school`
   *         is null or empty, in which case it is "Unknown School"
   */
  private def fillMissingSchool(df: DataFrame): DataFrame = {
    val school = col("school")
    val isMissing = school.isNull || school === ""
    df.withColumn("filled_school", when(isMissing, lit(UnknownSchool)).otherwise(school))
  }
}

文章来源:https://blog.csdn.net/qq_52128187/article/details/135264876
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。