# Two ways to obtain the unique `dnum` values of DataFrame `d`:
# dropDuplicates() with no arguments and distinct() both remove duplicate rows.
ff = d.select(['dnum']).dropDuplicates()
ff.count()  # number of unique rows; the value is only displayed in a REPL/notebook
ff.show()   # prints the first rows of the de-duplicated result
fff = d.select(['dnum']).distinct()
2、withColumn、lit、col
withColumn 增加一列（若列名已存在，则替换该列）
lit 用常量（字面值）创建一列
col 按列名引用一列
import pyspark.sql.functions as F

# withColumn adds a column (or replaces one with the same name);
# F.lit wraps a plain Python value so it can be used as a constant column.
temp_df = temp_df.withColumn("date", F.lit(target_date))

# Strip every "[" from the `tags` column. The pattern is a Java regular
# expression, where a bare "[" opens a character class and is rejected as
# unclosed, so the bracket has to be escaped.
movie_feature_df = movie_feature_df.withColumn(
    'tags', F.regexp_replace(F.col('tags'), r"\[", "")
)
3、unionByName、groupByplay_video_df = Nonefor i in range(args.range): t = target_date - datetime.timedelta(days=i) temp_df = spark.sql( "select * from ***album where year=%s and month=%s and day=%s" % (t.year, t.month, t.day)) temp_df = temp_df.withColumn("date", F.lit(target_date)) if play_video_df == None: play_video_df = temp_df else: play_video_df = play_video_df.unionByName(temp_df)target_df = play_video_dftarget_groupped_movie_df = target_movie_df.groupBy("dnum", "aid").agg(F.max("finish_rate").alias("finish_rate"))