Top Tags: MapReduce Code

This job finds the top 10 tags used in forum posts, ordered by the number of questions they appear in.
The forum CSV file (tab-delimited) contains the following columns:
“id”    “title”    “tagnames”    “author_id”    “body”    “node_type”    “parent_id”    “abs_parent_id”    “added_at”    “score”    “state_string”    “last_edited_id”    “last_activity_by_id”    “last_activity_at”    “active_revision_id”    “extra”    “extra_ref_id”    “extra_count”    “marked”

Mapper result (sample of tag / node type pairs; the reducer result follows below):
application question
board question
browsers question
bug question
cs101 question
cs101 question
cs101 answer
cs101 question
cs101 answer
cs101 answer
cs101 answer
cs101 question
cs101 question
cs101 question
cs101 question
cs101 question
cs212 question
cs253 question
cs253 question
cs253 answer
cs253 answer
cs253 answer
cs253 question
Tags from questions only:
Tag Counts
cs101 11622
cs373 4952
cs253 4542
discussion 3560
meta 2664
cs212 2009
homework 1682
bug 1651
cs262 1561
st101 1489

Tags from questions, answers and comments:
Tag Counts
cs101 82743
cs373 35658
cs253 30605
cs212 14302
st101 9977
cs262 8719
ph100 7444
cs215 5501
cs387 5174
discussion 3560

#!/usr/bin/python
import sys
import csv
import collections

def mapper():
    """Emit one ``tag<TAB>node_type`` line per tag of every forum post.

    Reads tab-delimited rows from standard input, discards the first row
    (the column header) and, for every remaining row, writes one line to
    standard output for each whitespace-separated tag found in column 2
    (``tagnames``), paired with the value of column 5 (``node_type``).

    Rows with fewer than six columns are skipped, because they carry no
    ``node_type`` value to emit.
    """
    reader = csv.reader(sys.stdin, delimiter='\t')
    # next(..., None) works on Python 2 and 3 and does not raise
    # StopIteration when the input is empty.
    next(reader, None)
    for line in reader:
        if len(line) < 6:
            continue
        tagnames = line[2]
        node_type = line[5]
        for tn in tagnames.split():
            # Write the pair explicitly: a comma-separated print statement
            # pads the tag with a space before the tab, which pollutes the key.
            sys.stdout.write("%s\t%s\n" % (tn, node_type))
  
def reducer():
    """Count posts per tag and print the most used tags.

    Reads ``tag<TAB>node_type`` lines from standard input. Consecutive lines
    with the same tag are treated as one group, so the input must arrive
    grouped by tag (presumably sorted by the streaming framework -- confirm
    for your runner). For every tag two counters are kept:

    * ``qac`` -- lines whose node type is ``question``, ``answer`` or
      ``comment``;
    * ``q``   -- lines whose node type is ``question``.

    After the input is exhausted, two tables are printed through ``Toptag``:
    first the ranking by ``qac``, then the ranking by ``q``.

    Lines that do not have exactly two tab-separated fields are skipped.
    """
    oldTagname = None
    qac = 0
    q = 0
    # Both maps go from a count to the list of tags having that count,
    # which is the shape Toptag expects.
    tagqac = collections.defaultdict(list)
    tagq = collections.defaultdict(list)
    reader = csv.reader(sys.stdin, delimiter='\t')
    for line in reader:
        # Validate before unpacking; unpacking a malformed line first would
        # raise ValueError and the guard would never be reached.
        if len(line) != 2:
            continue
        # Strip padding so keys emitted with a space next to the tab still
        # group and match correctly.
        newTagname, newNodetype = (field.strip() for field in line)
        if oldTagname is not None and oldTagname != newTagname:
            # Tag changed: flush the finished group's totals.
            tagqac[qac].append(oldTagname)
            tagq[q].append(oldTagname)
            qac = 0
            q = 0

        oldTagname = newTagname
        if newNodetype in ('question', 'answer', 'comment'):
            qac += 1
            if newNodetype == 'question':
                q += 1
    if oldTagname is not None:
        # Flush the last group, which has no following tag to trigger it.
        tagqac[qac].append(oldTagname)
        tagq[q].append(oldTagname)

    write = sys.stdout.write
    write("Tags from questions, answers and comments: \n")
    write("Tag\tCounts\n")
    Toptag(tagqac)
    write("\n")
    write("Tags from questions only: \n")
    write("Tag\tCounts\n")
    Toptag(tagq)
   
def Toptag(tt, limit=10):
    """Print the tags with the highest counts, one ``tag<TAB>count`` per line.

    Args:
        tt: mapping from a count to the list of tags that have that count.
        limit: maximum number of lines to print (defaults to 10).

    Counts are visited from highest to lowest; tags sharing a count are
    printed in the order they appear in their list. Nothing is returned.
    """
    remaining = limit
    for count in sorted(tt, reverse=True):
        for tag in tt[count]:
            if remaining <= 0:
                # Return instead of break so the remaining count buckets
                # are not visited once the limit is reached.
                return
            sys.stdout.write("%s\t%s\n" % (tag, count))
            remaining -= 1
Advertisement

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: