Graphs

graph
Clustering Coefficient
clusting coefficient

The clustering coefficient for graph is just the average of the clustering coefficients of the nodes in the graph.

flights = [("ORD", "SEA"), ("ORD", "LAX"), ('ORD', 'DFW'), ('ORD', 'PIT'),
           ('SEA', 'LAX'), ('LAX', 'DFW'), ('ATL', 'PIT'), ('ATL', 'RDU'),
           ('RDU', 'PHL'), ('PIT', 'PHL'), ('PHL', 'PVD')]

G = {}
for (x,y) in flights: make_link(G,x,y)

def clustering_coefficient(G,v):
    neighbors = G[v].keys()
    if len(neighbors) == 1: return 0.0
    links = 0
    for w in neighbors:
        for u in neighbors:
            if u in G[w]: links += 0.5
     return 2.0*links/(len(neighbors)*(len(neighbors)-1))

print G
# {'RDU': {'ATL', 'PHL'}, 'PHL': {'RDU', 'PIT', 'PVD'}, 'ATL': {'RDU', 'PIT'}, 
#'LAX': {'ORD', 'DFW', 'SEA'}, 'PVD': {'PHL'}, 'SEA': {'ORD', 'LAX'},
# 'ORD': {'PIT', 'DFW', 'LAX', 'SEA'}, 'PIT': {'ORD', 'ATL', 'PHL'}, 'DFW': {'ORD', 'LAX'}}
cc = clustering_coefficient(G,"ORD")
print cc #0.3333

total = 0
for v in G.keys():
    cc = clustering_coefficient(G,v)
    print v, cc,total
    total += cc
# RDU 0, PHL 0, ATL 0, LAX 0.666, SES 1, ORD 0.333, PIT 0, DFW 1
DFW 1.0 2.0
0.333333333333
print total/len(G) # 0.3333

Connected Components
connectedcompoents– Is everything connected?
– Is a set of nodes isolated?
– Can all nodes communicate?

connections = [("a", "g"), ("a", "d"), ('d', 'g'), ('g', 'c'),
           ('b', 'f'), ('f', 'e'), ('e', 'h')]

G = {}
for (x,y) in connections: make_link(G,x,y)
#G => {'a': {'d': 1, 'g': 1}, 'c': {'g': 1}, 'b': {'f': 1}, 'e': {'h': 1, 'f': 1}, 'd': {'a': 1, 'g': 1}, 
#'g': {'a': 1, 'c': 1, 'd': 1}, 'f': {'b': 1, 'e': 1}, 'h': {'e': 1}}
def mark_componet(G,node, marked):
    total_marked = 1
    marked[node]=True
    for neighbor in G[node]:
            if neighbor not in marked:
                total_marked+= mark_componet(G, neighbor, marked)
    return   total_marked

def list_component_sizes(G):
    marked={}
    for node in G.keys():
        if node not in marked:
            cc = mark_componet(G, node, marked)
            print "Component Containing", node, " : ", cc, marked

list_component_sizes(G)
#Component Containing a  :  4 {'a': True, 'c': True, 'd': True, 'g': True}
a -> (d,g) -> d ->(a,g) -> g->(a,c,d) -> c->(g)
#Component Containing b  :  4 {'a': True, 'c': True, 'b': True, 'e': True, 'd': True, 'g': True, 'f': True, 'h': True}

Running time of this algorithm

Graph search – Depth first search and Breadth first search.

Depth first search algorithm – The actual running time is 0(n + m), this is linear in the size of the graph. n : nodes and m: edges

Pairwise shortest path problem – http://oracleofbacon.org

Compare depth and breadth first search
3_04_depth_vs_breadth

Depth-first search without recursion – a new data structure called the open list – The open list, it as a kind of a “to do list” because it keeps track of what it is I need to do next and essentially
3_05_depth first without recursionOpen list – Put p onto the open list, mark it visited, so the neighbors of p in this case are r, s, and q.
So, we’re going to add r, s, and q to the open list and mark them all as visited.
Go to the graph, find all the neighbors of q in this case just t, mark it and add it to the “to do list.”

Depth first search with recursion
def mark_component(G, node, marked):
    marked[node] = True
    total_marked = 1
    for neighbor in G[node]:
        if neighbor not in marked:
            total_marked += mark_component(G, neighbor, marked)
    return total_marked

marked ={}
def check_connection(G, v1, v2):
    tm = mark_component(G, v1, marked)
    return v2 in marked
    
edges = [('a', 'g'), ('a', 'd'), ('g', 'c'), ('g', 'd'), ('b', 'f'), ('f', 'e'), ('e', 'h')]
G = {} 
# {'a': {'d': 1, 'g': 1}, 'c': {'g': 1}, 'b': {'f': 1}, 'e': {'h': 1, 'f': 1}, 'd': {'a': 1, 'g': 1}, 
#'g': {'a': 1, 'c': 1, 'd': 1}, 'f': {'b': 1, 'e': 1}, 'h': {'e': 1}}
for v1, v2 in edges:
  make_link(G, v1, v2
assert check_connection(G, "a", "c") == True #true total_marked=4, a-> (d,g)1,d ->(g)2,g->(c)3 c->4
assert check_connection(G, 'a', 'b') == False

3_06_depth first withour recursion_1 1d-2e,3c
3c-2e,4b
4b-2e,5a
5a-2e
2e-6f
6f-7g
7g

#Depth first search without recursion
# Rewrite `mark_component` to not use recursion 
# and instead use the `open_list` data structure 
def mark_component(G, node, marked):
    total_marked =1
    open_list=[node]
    marked[node] = True
    while(len(open_list)>0):
        current = open_list.pop()
        for neighbor in G[current]:
            if neighbor not in marked:
                total_marked+=1
                open_list.append(neighbor)
                marked[neighbor] = True
    return total_marked

test_edges = [(1, 2), (2, 3), (4, 5), (5, 6)]
G = {}
for n1, n2 in test_edges:
    make_link(G, n1, n2)
marked = {}
assert mark_component(G, 1, marked) == 3
assert 1 in marked
assert 4 not in marked

3_07_breath first withour recursion Grabbing the last element means you are using a “stack”. Grabbing the first means you are using a “queue”.1d-2e,3c
2e-3c,4f
3c-4f,5b
4f-5b,6g
5b-6g,7a
6g-7a
7a-

#Breath first search without recursion
import csv
def read_graph(filename):
    print filename
    #read an undirected graph in csv format, Each line is an edge
    tsv = csv.reader(open(filename), delimiter='\t')
    G={}
    for (node1,node2) in tsv: make_link(G, node1, node2)
    return G

#marvelG =read_graph("unqiue_edges.tsv")
marvelG =read_graph("ue.tsv")

def path(G, v1, v2):
    distance_from_start ={}
    open_list  =[v1]
    distance_from_start[v1]=[v1]
    while(len(open_list)>0) :
        current = open_list[0]
        del open_list[0]
        for neighbor in G[current]:
            if neighbor not in distance_from_start:
                distance_from_start[neighbor]=distance_from_start[current]+[neighbor]
                if neighbor==v2:
                    return distance_from_start[v2]
                open_list.append(neighbor)
    return False                       
    
from_node ="s"
to_node ="h"
print path(marvelG, from_node,  to_node)

3_01_centralityCentrality – A centrality algorithm that measures centrality by taking the average distance from a node to all the other nodes that it can reach.
3 Standard Centrality measures in a network:Degree, Closeness, Betweenness

centrality
centrality1
centrality2

closeness
between

#Average of the shortest path distance to each reachable node.
import csv
mg =read_graph("ue.tsv")
def centrality(G, v):
    distance_from_start={}
    open_list=[v]
    distance_from_start[v]=0
    while(len(open_list)>0):
        current = open_list[0]
        del open_list[0]
        for neighbor in G[current].keys():
            if neighbor not in distance_from_start:
                distance_from_start[neighbor]=distance_from_start[current]+1
                open_list.append(neighbor)
     #max(distance_from_start.values()) #2
     return (sum(distance_from_start.values())+0.0)/len(distance_from_start)                 

from_node ="s"

#len(distance_from_start) = 9 
#sum(distance_from_start.values() = 13
#distance_from_start = {'a': 1, 'c': 1, 'b': 2, 'e': 2, 'd': 2, 'g': 1, 'f': 2, 'h': 2, 's': 0}
print centrality(mg, from_node) #1.44444444444
import csv
import random
import operator
from collections import deque

def read_graph(filename):
    tsv = csv.reader(open(filename), delimiter='\t')
    G={}
    actors={}
    movies={}
    index = 0
    for (actor,movie, date) in tsv:
        m =     movie+", "+date
        if actor not in actors:
            actors[actor] = index
            index += 1
        if m not in movies:
            movies[m] = index
            index += 1
        make_link(G, actors[actor], movies[m])
    return (G, actors)

(imdb, acotrs) =read_graph("4_0010_imdb01.tsv")
print "imdb loaded"

#deque is 3 time faster than open list
def centrality(G, v):
    front, distance = deque([v]), {v: 0}
    while front:
        current = front.popleft()
        #print head
        for neighbor in G[current]:
            if neighbor not in distance:
                distance[neighbor] = distance[current] + 1
                front.append(neighbor)
    return (sum(distance.values())+0.0)/len(distance)

#open list
#def centrality(G, v):
#    the above code open list ...

def partition(L,n):
    l={}
    r={}
    m={}
    for v in L:
        if L[v] < L[n]: l[v]=L[v]
        elif L[v] > L[n]: r[v]=L[v]
        elif L[v] == L[n] : m[v]=L[v]
    return(l, m, r)

def topk(L,K):
    v = random.choice(L.keys())
    (left,middle, right) =  partition(L,v)
    if len(left)==K: return left
    elif len(left)+1==K: return dict(left, **middle)
    elif len(left)>=K: return topk(left,K)
    tmpd =dict(left, **middle)
    tkd =topk(right,K-len(left)-1)
    return dict(tmpd, **tkd)

print "centralities calcualtion"
centralities = {}
for acotr in acotrs:
    centralities[acotr] =centrality(imdb,acotrs[acotr])

print "sort centralities"
tk = topk(centralities,20)
sorted_x = sorted(tk.iteritems(), key=operator.itemgetter(1))
print sorted_x

'''
input
McClure, Marc (I)    Freaky Friday    2003
McClure, Marc (I)    Coach Carter    2005
McClure, Marc (I)    Superman II    1980
McClure, Marc (I)    Apollo 13    1995
McClure, Marc (I)    Superman    1978
McClure, Marc (I)    Back to the Future    1985
McClure, Marc (I)    Back to the Future Part III    1990

deque passed actor int 75.7587950472
passed actor int 194.672884684
passed actor string 219.399739313

output
[('Tatasciore, Fred', 3.6174691202419966), 
('Jackson, Samuel L.', 3.699142929165616),
 ('Welker, Frank', 3.7147718679102595), 
('Harnell, Jess', 3.7344340811696495), 
('Willis, Bruce', 3.744013108142173),
 ('Hanks, Tom', 3.746281825056718), 
('Blum, Steve (IX)', 3.763675321401563), 
('Papajohn, Michael', 3.771237711116713), 
('Voight, Jon', 3.7714897907738845), 
('Starr, Arne', 3.7722460297453995), 
('Damon, Matt', 3.7760272246029745), 
('Cruise, Tom', 3.776279304260146), 
('Downes, Robin Atkin', 3.777035543231661), 
('Abernathy, Don', 3.7782959415175195), 
('Wilson, Owen (I)', 3.7820771363750945), 
('Pitt, Brad', 3.787118729518528), 
('Baldwin, Alec', 3.7888832871187295), 
('Diaz, Cameron', 3.7896395260902445), 
('North, Nolan', 3.793924880262163), 
('Hoffman, Dustin', 3.794429039576506)]

'''

Bridge Edges
The bridges of a connected graph are the graph edges whose removal disconnects the graph. A bridge is an edge of a graph whose removal increases the number of components of graph. A graph containing one or more bridges is said to be a bridged graph, while a graph containing no bridges is called a bridgeless graph. Every edge of a tree is a bridge.

For example, In some sobridge_edgescial networks, it’s useful to know, are there any edges in this graph stop (disconnect)? Would be disconnection in the network as a whole? Or are still connected?

So 1st is rewritten the graph as a tree and 2nd is post order the nodes.
Building Trees – any edges that don’t really fit that pattern put in that dotted line.
Post Ordering – Assigning a number (sequential) to each of the nodes in the tree. All the children on its left are numbered. All the children on its right are numbered, children and descendant.

Finding Bridge Edges
finding_bridge_edgesBlue number – Compute the number of descendents (ND) for each node in the graph, that are either the node or reachable from the node by following green (line) edges only. For example, F has just the node itself and no descendants = 1, same thing with G. E has one descendant + 2 one descendent = 3,… a=7

L post-order value that is the smallest, For example, F(2) has itself and it also can reach G(3), the smallest value is 2. G(3) has no descendants but it can reach by one red edge(dotted line) F(2) = 2. E(4) has F(2), G(2) and itself that it can reach no other nodes by non-tree edges (dotted line) = 1. D has itself and a non-tree edge (dotted line) b(1)= 1. C has itself and one tree edge d(1) = 1. B(1) itself =1. A includes everything= 1.

H- the largest value, For example, G between F(2) and G(3) the largest value is 3. E => E(4), F(3), G(3) = 4. D => D(5),E(4),F(2), G(3) and B(1) reachable by a red edge = 5. C =6, A set contains everybody = 7.

Build rooted spanning out of graph

#Bridge Edges

#graph = [('a','b'),('a','c'),('b','d'),('c','d'),('d','e'),('e','f'),('e','g'),('f','g')]
#
# grpG = {'a': {'c': 1, 'b': 1}, 
#      'b': {'a': 1, 'd': 1}, 
#      'c': {'a': 1, 'd': 1}, 
#      'd': {'c': 1, 'b': 1, 'e': 1}, 
#      'e': {'d': 1, 'g': 1, 'f': 1}, 
#      'f': {'e': 1, 'g': 1},
#      'g': {'e': 1, 'f': 1} 
#      }
# 
# rewritten as a spanning tree
# S = {'a': {'c': 'green', 'b': 'green'}, 
#      'b': {'a': 'green', 'd': 'red'}, 
#      'c': {'a': 'green', 'd': 'green'}, 
#      'd': {'c': 'green', 'b': 'red', 'e': 'green'}, 
#      'e': {'d': 'green', 'g': 'green', 'f': 'green'}, 
#      'f': {'e': 'green', 'g': 'red'},
#      'g': {'e': 'green', 'f': 'red'} 
#      }

def make_link(G, node1, node2, r_or_g_or_1):
    if node1 not in G:
        G[node1] = {}
    (G[node1])[node2] = r_or_g_or_1
    if node2 not in G:
        G[node2] = {}
    (G[node2])[node1] = r_or_g_or_1
    return G

graph = [('a','b'),('a','c'),('b','d'),('c','d'),('d','e'),('e','f'),('e','g'),('f','g')]
grpG={}
for s, e in graph:
    make_link(grpG, s, e, 1)

#DFS
def create_rooted_spanning_tree(G, root):
    open_list = [root]
    S = {root:{}}
    while len(open_list) > 0:
        node = open_list.pop()
        neighbors = G[node]
        for n in neighbors:
            if n not in S:
                # not seen this node, use a green edge to connect it
                make_link(S, node, n, 'green')
                open_list.append(n)
            else:
                # seen this node, verify not have the edge in S
                if node not in S[n]: make_link(S, node, n, 'red')
    return S

#mulitple possible solutions
def test_create_rooted_spanning_tree():
    S = create_rooted_spanning_tree(grpG, "a")
    #this is one possible solution
    assert S == {'a': {'c': 'green', 'b': 'green'}, 
                 'b': {'a': 'green', 'd': 'red'}, 
                 'c': {'a': 'green', 'd': 'green'}, 
                 'd': {'c': 'green', 'b': 'red', 'e': 'green'}, 
                 'e': {'d': 'green', 'g': 'green', 'f': 'green'}, 
                 'f': {'e': 'green', 'g': 'red'},
                 'g': {'e': 'green', 'f': 'red'} 
                 }

test_create_rooted_spanning_tree()

Generate Pre Order nodes

def get_children(S, root, parent):
    #returns the children from following the green edges
    return [n for n, e in S[root].items() if ((not n == parent) and (e == 'green'))]

def _post_order(S, root, parent, val, po):
    children = get_children(S, root, parent)
    for c in children: val = _post_order(S, c, root, val, po)
    po[root] = val
    return val + 1

def post_order1(S, root):
    po = {}
    _post_order(S, root, None, 1, po)
    return po

def test_post_order():
    S = create_rooted_spanning_tree(grpG, "a")
    po = post_order2(S, 'a')
    assert po == {'a':7, 'b':1, 'c':6, 'd':5, 'e':4, 'g':2, 'f':3}

test_post_order()

Calculate Number of Descendants

def _number_descendants(S, root, parent, nd):
    children = get_children(S, root, parent)
    nd_val = 1
    for c in children: nd_val += _number_descendants(S, c, root, nd)
    nd[root] = nd_val
    return nd_val

def number_of_descendants2(S, root):
    nd = {}
    _number_descendants(S, root, None, nd)
    return nd

def test_number_of_descendants():
    S = create_rooted_spanning_tree(grpG, "a")
    nd = number_of_descendants1(S, 'a')
    #nd = number_of_descendants2(S, 'a')
    print nd
    assert nd == {'a':7, 'b':1, 'c':5, 'd':4, 'e':3, 'f':1, 'g':1}

test_number_of_descendants()

Generate Lowest Post Order

def get_children_all(S, root, parent):
    #split green & red edges
    green = []
    red = []
    for n, e in S[root].items():
        if n == parent: continue
        if e == 'green': green.append(n)
        if e == 'red': red.append(n)
    return green, red

def _general_post_order(S, root, parent, po, gpo, comp):
    green, red = get_children_all(S, root, parent)
    val = po[root]
    for c in green:
        test = _general_post_order(S, c, root, po, gpo, comp)
        if comp(val, test): val = test
    for c in red:
        test = po[c]
        if comp(val, test): val = test
    gpo[root] = val
    return val

def lowest_post_order(S, root, po):
    lpo = {}
    _general_post_order(S, root, None, po, lpo, lambda x, y: x > y) 
    return lpo

def test_lowest_post_order():
    S = create_rooted_spanning_tree(grpG, "a")
    po = post_order(S, 'a')
    l = lowest_post_order(S, 'a', po)
    assert l == {'a':1, 'b':1, 'c':1, 'd':1, 'e':2, 'f':2, 'g':2}

test_lowest_post_order()

Generate Highest Post Order

def highest_post_order(S, root, po):
    hpo = {}
    _general_post_order(S, root, None, po, hpo, lambda x, y: x < y)
    return hpo

def test_highest_post_order():
    S = create_rooted_spanning_tree(grpG, "a")
    po = post_order(S, 'a')
    l = highest_post_order(S, 'a', po)
    assert h == {'a':7, 'b':5, 'c':6, 'd':5, 'e':4, 'f':3, 'g':3}

test_highest_post_order()

Bridge Edges

def bridge_edges(G, root):
    S = create_rooted_spanning_tree(G, root)
    po = post_order(S, root)
    nd = number_of_descendants(S, root)
    lpo = lowest_post_order(S, root, po)
    hpo = highest_post_order(S, root, po)
    
    open_list=[(root,None)]
    bg=[]
    while len(open_list)>0:
        cur,parent = open_list.pop(0)
        n= [c for c in S[cur] if S[cur][c]=='green' and c != parent] 
        print cur, n
        for e in n:
            if hpo[e]po[e]-nd[e]: bg.append((cur,e))
            open_list.append((e,cur))
    return bg

bridges = bridge_edges(grpG, 'a')
print bridges
assert bridges == [('d', 'e')]

Bipartite Graphs
3_13_bipartite_graphs
A bipartite graph is a graph whose nodes can be divided into two sets U (characters) and V (comic books). Every edge in the graph connects a node from U to one in V. This means there are no direct connections between the nodes in each set. To get from a node in U to another node in U, you’ll have to visit a node in V.

3_14_bipartite_graphs_example

#find highest strength value

import csv
from collections import defaultdict

def make_link(G, node1, node2):
    if node1 not in G: G[node1] = defaultdict(int)
    (G[node1])[node2] += 1
    if node2 not in G: G[node2] = defaultdict(int)
    (G[node2])[node1] += 1
    return G

def read_graph(filename):
    tsv = csv.reader(open(filename), delimiter='\t')
    G, characters = {}, {}
    index = 0
    for (char, comic) in tsv: 
        if char not in characters:
            characters[char]=1
        make_link(G, char, comic)
    return G, characters
    

#marvelG, marvelCharG = read_graph('unqiue_edges.tsv')
marvelG, marvelCharG = read_graph('ue1.tsv')
charG = {}
highest = (-1, 'none', 'none')
for char1 in marvelCharG:
    charG[char1] = defaultdict(int)
    for comic in marvelG[char1]:
        for char2 in marvelG[comic]:
            if char1!=char2: (charG[char1])[char2] += 1
    for char2 in charG[char1]:
        if (charG[char1][char2]>highest[0]):
            highest = (charG[char1][char2],char1,char2)
print highest
'''
A	1
B	1
B	2
C	1
C	2
C	3
D	1
D	2
D	3
D	4

{'A': 0, 'C': 2, 'B': 1, 'D': 3}

'A': {'1': 1}), 
'C': {'1': 1, '3': 1, '2': 1}), 
'B': {'1': 1, '2': 1}), 
'D': {'1': 1, '3': 1, '2': 1, '4': 1}), 
'1': {'A': 1, 'C': 1, 'B': 1, 'D': 1}), 
'3': {'C': 1, 'D': 1}), 
'2': {'C': 1, 'B': 1, 'D': 1}), 
'4': {'D': 1})

'A': {'C': 1, 'B': 1, 'D': 1}), 
'C':  {'A': 1, 'B': 2, 'D': 3}), 
'B':  {'A': 1, 'C': 2, 'D': 2}), 
'D':  {'A': 1, 'C': 3, 'B': 2})

a - a c b d
b - a c  d c  d
'''

Advertisement

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: