Spark键值对的连接

join连接

from pyspark import SparkContext

if __name__ == "__main__":
    # Demo: inner join of two pair RDDs keyed by store name.
    # `sys` is imported locally because this script reads sys.argv and the
    # surrounding snippet only imports SparkContext.
    import sys

    # A single optional command-line argument overrides the Spark master URL.
    master = sys.argv[1] if len(sys.argv) == 2 else "local"

    # When re-run in an interactive session, a context named `sc` may still
    # exist from a previous run; stop it before creating a new one. In a
    # fresh process the name is simply undefined, which is expected.
    try:
        sc.stop()
    except NameError:
        pass

    sc = SparkContext(master, 'test')
    try:
        storeAddress = sc.parallelize((
            ("Ritual", "1026 Valencia St"),
            ("Philz", "748 Van Ness Ave"),
            ("Philz", "3101 24th St"),
            ("Starbucks", "Seattle"),
        ))
        storeRating = sc.parallelize((
            ("Ritual", 4.9),
            ("Philz", 4.8),
        ))

        # join() keeps only keys present in both RDDs.
        result = storeAddress.join(storeRating)
        print(result.collect())
    finally:
        # Always release the context, even if the job fails.
        sc.stop()

输出:(这是内连接)

[('Ritual', ('1026 Valencia St', 4.9)), 
('Philz', ('748 Van Ness Ave', 4.8)), 
('Philz', ('3101 24th St', 4.8))]

leftOuterJoin左连接

from pyspark import SparkContext

if __name__ == "__main__":
    # Demo: left outer join of two pair RDDs keyed by store name.
    # `sys` is imported locally because this script reads sys.argv and the
    # surrounding snippet only imports SparkContext.
    import sys

    # A single optional command-line argument overrides the Spark master URL.
    master = sys.argv[1] if len(sys.argv) == 2 else "local"

    # When re-run in an interactive session, a context named `sc` may still
    # exist from a previous run; stop it before creating a new one. In a
    # fresh process the name is simply undefined, which is expected.
    try:
        sc.stop()
    except NameError:
        pass

    sc = SparkContext(master, 'test')
    try:
        storeAddress = sc.parallelize((
            ("Ritual", "1026 Valencia St"),
            ("Philz", "748 Van Ness Ave"),
            ("Philz", "3101 24th St"),
            ("Starbucks", "Seattle"),
        ))
        storeRating = sc.parallelize((
            ("Ritual", 4.9),
            ("Philz", 4.8),
        ))

        # leftOuterJoin() keeps every key of the left RDD (storeAddress);
        # keys missing from storeRating get None as the right-hand value.
        result = storeAddress.leftOuterJoin(storeRating)
        print(result.collect())
    finally:
        # Always release the context, even if the job fails.
        sc.stop()

输出:(左外连接:保留左侧 RDD 的所有键,右侧没有匹配的值以 None 填充)

[('Ritual', ('1026 Valencia St', 4.9)), 
('Philz', ('748 Van Ness Ave', 4.8)), 
('Philz', ('3101 24th St', 4.8)), 
('Starbucks', ('Seattle', None))]

rightOuterJoin右连接

from pyspark import SparkContext

if __name__ == "__main__":
    # Demo: right outer join of two pair RDDs keyed by store name.
    # `sys` is imported locally because this script reads sys.argv and the
    # surrounding snippet only imports SparkContext.
    import sys

    # A single optional command-line argument overrides the Spark master URL.
    master = sys.argv[1] if len(sys.argv) == 2 else "local"

    # When re-run in an interactive session, a context named `sc` may still
    # exist from a previous run; stop it before creating a new one. In a
    # fresh process the name is simply undefined, which is expected.
    try:
        sc.stop()
    except NameError:
        pass

    sc = SparkContext(master, 'test')
    try:
        storeAddress = sc.parallelize((
            ("Ritual", "1026 Valencia St"),
            ("Philz", "748 Van Ness Ave"),
            ("Philz", "3101 24th St"),
            ("Starbucks", "Seattle"),
        ))
        storeRating = sc.parallelize((
            ("Ritual", 4.9),
            ("Philz", 4.8),
        ))

        # rightOuterJoin() keeps every key of the right RDD (storeRating);
        # keys missing from storeAddress would get None as the left value.
        result = storeAddress.rightOuterJoin(storeRating)
        print(result.collect())
    finally:
        # Always release the context, even if the job fails.
        sc.stop()

输出:(右外连接:保留右侧 RDD storeRating 的所有键;本例中这些键在 storeAddress 中都有匹配,因此结果与内连接相同)

[('Ritual', ('1026 Valencia St', 4.9)), 
('Philz', ('748 Van Ness Ave', 4.8)), 
('Philz', ('3101 24th St', 4.8))]

你可能感兴趣的:(Spark键值对的连接)