Common Operations
# create a key-value RDD: keyBy derives a key from each record
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
words = spark.sparkContext.parallelize(myCollection, 2)
keyword = words.keyBy(lambda word: word.lower()[0])

# mapValues transforms only the values, leaving the keys untouched
keyword.mapValues(lambda word: word.upper()).collect()
# [('s', 'SPARK'), ('t', 'THE'), ('d', 'DEFINITIVE'), ('g', 'GUIDE'), (':', ':'),
#  ('b', 'BIG'), ('d', 'DATA'), ('p', 'PROCESSING'), ('m', 'MADE'), ('s', 'SIMPLE')]

# look up the values for a particular key
keyword.lookup("s")
# ['Spark', 'Simple']

# sampleByKey: sample an RDD by a set of keys
# RDD.sampleByKey(withReplacement, fractions, seed=None)
# The first argument controls sampling with or without replacement, the second
# gives the per-key sampling fraction, and the third is the random seed.
# Note that the size of the returned subset is only approximate, not guaranteed.
import random

# extract the distinct characters appearing in the words
distinctChars = words.flatMap(lambda word: list(word.lower()))\
  .distinct()\
  .collect()
sampleMap = dict(map(lambda c: (c, random.random()), distinctChars))
words.map(lambda word: (word.lower()[0], word))\
  .sampleByKey(True, sampleMap, 6)\
  .collect()
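A quick way to see how approximate sampleByKey is: compare countByKey before and after sampling. This is a minimal sketch reusing the words RDD above; keyedWords and the 0.5 fraction are illustrative assumptions. Every key that occurs in the RDD needs an entry in the fractions dict (which is why sampleMap above was built from all distinct characters).

# count records per key before sampling
keyedWords = words.map(lambda word: (word.lower()[0], word))
keyedWords.countByKey()
# e.g. defaultdict(<class 'int'>, {'s': 2, 'd': 2, 't': 1, ...})

# request roughly half of the records under every key
fractions = {k: 0.5 for k in keyedWords.keys().distinct().collect()}
keyedWords.sampleByKey(False, fractions, 6).countByKey()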
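mapValues always produces exactly one output value per input value. Its one-to-many counterpart is flatMapValues, which expands each value into zero or more values, all under the original key. A minimal sketch; the pairs RDD here is an assumption for illustration.

pairs = spark.sparkContext.parallelize([("s", "Spark"), ("s", "Simple")])
# one output value per input value, keys unchanged
pairs.mapValues(lambda v: v.upper()).collect()
# [('s', 'SPARK'), ('s', 'SIMPLE')]
# each value expands into several values, each paired with the original key
pairs.flatMapValues(lambda v: list(v.lower())).take(5)
# [('s', 's'), ('s', 'p'), ('s', 'a'), ('s', 'r'), ('s', 'k')]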
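One caveat on lookup: it is an action, so the matching values come back as a plain Python list on the driver, which makes it best suited to keys with small value sets. A short sketch reusing the keyword RDD defined above.

keyword.lookup("d")
# ['Definitive', 'Data']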