Blog of Sara Jakša

The Characters from Arrowverse Appearing in the Same Stories in Fanfiction

I am (most likely) going to be analyzing fanfiction data for my master's thesis. Since I already had this data available, I decided to see whether I could come up with some interesting analysis.

One of the things that I am interested in is the relationships between people. I wanted to see which characters appear together in the same stories, so I used the character tags of the stories and analyzed when they appear together.

import sqlite3
import os
import re
import bs4
import pandas
import networkx
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy
import community  # the python-louvain package, used for community detection below
import json
database_file_name = "sqldata_arrowverse.sql"
folder_with_stories = "data"

First I needed to get all the character tags from the database (that I had collected beforehand).
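The queries below assume a database schema roughly like the following. This is only a sketch of my assumption, reconstructed from the queries themselves, since the scraping code is not part of this post: a work table with one row per story, and a tags table with one row per (work, tag) pair, where category separates character tags from relationship and freeform tags.

# A minimal sketch of the assumed schema (reconstructed from the queries
# below, not taken from the actual scraping code):
schema_connection = sqlite3.connect(":memory:")
schema_connection.executescript("""
CREATE TABLE work (id INTEGER PRIMARY KEY);
CREATE TABLE tags (work INTEGER REFERENCES work(id),
                   tag TEXT,
                   category TEXT);
""")
schema_connection.close()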

re_remove_middle_names = r'(".*?")'
sql_database = sqlite3.connect(database_file_name)
cursor = sql_database.cursor()
cursor.execute("""DROP VIEW all_tags;""")
cursor.execute("""
CREATE VIEW all_tags AS SELECT work, tag FROM tags WHERE category='Character' AND tag IN 
(SELECT tag
FROM Tags 
WHERE category="Character"
AND tag NOT IN ('Jason Todd', 'Alfred Pennyworth', 'James "Bucky" Barnes', 'Team Legends', 'Rogues',
'OC - Character', 'Sam Winchester', 'Sebastian Smythe', 'Stiles Stilinski', 'Barbara Gordon',
'Original Character', 'Dawn Allen', 'Dean Winchester', 'Clint Barton', 'Hal Jordan', 'Tony Stark', 
'Steve Rogers', 'Dick Grayson', 'Original Child Character(s)', 'Original Male Character(s)',
'Diana (Wonder Woman)', 'You', 'Bruce Wayne', 'Reader', 'Original Female Character(s)',
'Original Characters', 'Jason Todd', 'Batman', 'Selina Kyle', 'Original Metahuman Character',
'Team Flash', 'Team Flash (The Flash TV 2014)', 'Team Legends (DC''s Legends of Tomorrow)', 
'Original Metahuman Character(s)', 'Rogues (The Flash)')
GROUP BY tag 
HAVING count(tag) > 99
ORDER BY count(tag) DESC);""")
cursor.execute("""SELECT t1.tag as tag1, t2.tag as tag2, count(*) 
FROM all_tags t1 
INNER JOIN all_tags t2 ON t2.work = t1.work
AND tag1<>tag2
GROUP BY t1.tag, t2.tag
ORDER BY count(*) DESC;""")
tags_together = cursor.fetchall()
cursor.execute("SELECT id FROM work")
works_number = len(cursor.fetchall())
cursor.execute("""SELECT tag, count(*) FROM tags 
WHERE category='Character' AND tag IN (SELECT tag FROM all_tags) 
GROUP BY tag""")
tags_number_by_person = cursor.fetchall()
sql_database.close()
len(tags_together)
19110

Since some characters have multiple ways that they can be referred to (it is a superhero franchise, so a lot of people have at least a superhero name), I am doing some preprocessing to deal with this.

combine_people_dict = {"The Flash - Character": "Barry Allen", 
                       "Killer Frost": "Caitlin Snow", 
                       "Harrison Wells | Eobard Thawne": "Eobard Thawne",
                       "Eobard Thawne | Harrison Wells": "Eobard Thawne",
                       "Zari Adrianna Tomaz": "Zari Tomaz",
                       "Supergirl - Character": "Kara Danvers",
                       "Kara Zor-El": "Kara Danvers",
                       "Alura In-Ze | Alura Zor-El": "Alura Zor-El",
                       "Jimmy Olsen": "James Olsen",
                       "J'onn J'onzz | Hank Henshaw": "J'onn J'onzz",
                       "Hank Henshaw | J'onn J'onzz": "J'onn J'onzz",
                       "mon-el": "Mon-El",
                       "Harry Wells": "Earth-2 Harrison Wells",
                       "Jay Garrick | Hunter Zolomon": "Zoom",
                       "Winn Schott Jr.": "Winn Schott",
                       "Captain Cold": "Leonard Snart",
                       "Jess the Secretary": "Jess"}
tags_together_dict = dict()
for person1, person2, count in tags_together:
    if person1 == 'Harrison "Harry" Wells':
        person1 = "Earth-2 Harrison Wells"
    if person2 == 'Harrison "Harry" Wells':
        person2 = "Earth-2 Harrison Wells"        
    person1 = person1.split("(")[0].strip()
    person2 = person2.split("(")[0].strip()
    string_to_remove_1 = re.findall(re_remove_middle_names, person1)
    string_to_remove_2 = re.findall(re_remove_middle_names, person2)
    if string_to_remove_1:
        string_to_remove_1 = string_to_remove_1[0]
        person1 = person1[:person1.index(string_to_remove_1) - 1] + person1[person1.index(string_to_remove_1) + len(string_to_remove_1):]
    if string_to_remove_2:
        string_to_remove_2 = string_to_remove_2[0]
        person2 = person2[:person2.index(string_to_remove_2) - 1] + person2[person2.index(string_to_remove_2) + len(string_to_remove_2):]
    if person1 in combine_people_dict:
        person1 = combine_people_dict[person1]
    if person2 in combine_people_dict:
        person2 = combine_people_dict[person2]
    if person1 not in tags_together_dict:
        tags_together_dict[person1] = dict()
    if person2 not in tags_together_dict[person1]:
        tags_together_dict[person1][person2] = 0
    tags_together_dict[person1][person2] += count
len(tags_together_dict.keys())
137
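To make the normalization above a little more concrete, here is the same sequence of steps pulled into a small standalone function and applied to two example tags (the example tags are made up for illustration, they reuse re_remove_middle_names and combine_people_dict from above):

# The same normalization steps as above, collected into one function.
def normalise_tag(tag):
    if tag == 'Harrison "Harry" Wells':
        tag = "Earth-2 Harrison Wells"
    tag = tag.split("(")[0].strip()                     # drop the fandom suffix
    nickname = re.findall(re_remove_middle_names, tag)  # find a quoted nickname
    if nickname:
        nickname = nickname[0]
        tag = tag[:tag.index(nickname) - 1] + tag[tag.index(nickname) + len(nickname):]
    return combine_people_dict.get(tag, tag)            # merge alternative names

normalise_tag('Sara "Canary" Lance (Arrow)')   # -> 'Sara Lance'
normalise_tag('Killer Frost')                  # -> 'Caitlin Snow'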
tags_person_dict = dict()
for person, count in tags_number_by_person:
    tags_person_dict[person] = count

Now that the preprocessing of people and connections is done, I have my first result: the number of stories each character appears in. Kara seems to be the most popular.

tags_person_pandas = pandas.DataFrame.from_dict(tags_person_dict, orient="index", columns=["Count"])
tags_person_pandas.reset_index(level=0, inplace=True)
tags_person_pandas.sort_values("Count", ascending=False, inplace=True)
tags_person_pandas.head(10)
index Count
80 Kara Danvers 17055
118 Oliver Queen 15330
10 Barry Allen 14777
45 Felicity Smoak 13503
2 Alex Danvers 12858
87 Lena Luthor 9482
88 Leonard Snart 8789
140 Sara Lance 8125
19 Cisco Ramon 8112
12 Caitlin Snow 6792
all_relationships = []
for person1 in tags_together_dict:
    for person2 in tags_together_dict[person1]:
        all_relationships.append(tuple([person1, person2, {"weight": tags_together_dict[person1][person2]}]))

So now that we have this, let us try to visualize the whole network of people.

S = networkx.Graph()
S.add_nodes_from([a for a in tags_together_dict])
S.add_edges_from(all_relationships)
len(S.nodes())
137
plt.figure(1,figsize=(20,20)) 
networkx.draw(S, 
        with_labels=True, 
        pos=networkx.spring_layout(S), 
        font_weight='bold', 
        node_color="yellow", 
        width=3, 
        arrows=True, 
        node_size=2000,
        edge_color = numpy.linspace(0,1,len(S.edges()))
       )

png

Even when limiting it to just characters that appear in at least 100 stories (which is between 0.1% and 0.2% of all stories), not a lot can be seen from the graph. So the next step is to also limit it to connections that appear in more than 100 stories.

lowest_weight = 100
S = networkx.Graph()
S.add_edges_from([a for a in all_relationships if a[2]["weight"] > lowest_weight])
plt.figure(1,figsize=(30,30)) 
networkx.draw(S, 
        with_labels=True, 
        pos=networkx.spring_layout(S), 
        font_weight='bold', 
        node_color="yellow", 
        width=3, 
        arrows=True, 
        node_size=2000,
        edge_color = numpy.linspace(0,1,len(S.edges()))
             )

png

Looking at the graph above, there seems to be a clear divide between Supergirl and the rest of the series. The other three series have some differences, but there are a lot more connections between them. Which is interesting: it is as if the audience for Supergirl is different from the audience for the other three shows.

In the next part, I want to see which characters are the most influential.

pagerank = pandas.DataFrame.from_dict(networkx.pagerank(S, weight='weight'), orient="index", columns=["PageRank"])
pagerank.reset_index(level=0, inplace=True)
centrality = pandas.DataFrame.from_dict(networkx.degree_centrality(S), orient="index", columns=["Centrality"])
centrality.reset_index(level=0, inplace=True)
betweenes = pandas.DataFrame.from_dict(networkx.betweenness_centrality(S, weight='weight'), orient="index", columns=["Between"])
betweenes.reset_index(level=0, inplace=True)
ranking = pagerank.merge(centrality, left_on='index', right_on='index')
ranking = ranking.merge(betweenes, left_on='index', right_on='index')

The first one that I want to look at is PageRank. This is the one that takes the strength of the connecting nodes into account, so somebody who is not well connected, but is connected to well-connected people, could still have a high PageRank.
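To make this concrete, here is a tiny made-up graph (not taken from the fanfiction data): two characters each have only a single connection, but the one whose single connection is very strong still ends up with a PageRank close to the central node's.

toy = networkx.Graph()
toy.add_edge("a", "b", weight=100)  # b has a single, but very strong, connection
toy.add_edge("a", "c", weight=1)    # c has a single, weak connection
networkx.pagerank(toy, weight="weight")
# roughly {'a': 0.49, 'b': 0.46, 'c': 0.05}: b and c both have exactly one
# neighbour, but the heavy edge lifts b almost to the level of a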

In this regard, three of the four leads are at the top (the Legends of Tomorrow lead is the one missing), with Kara in first place.

ranking.sort_values("PageRank", ascending=False).head(10)
index PageRank Centrality Between
25 Kara Danvers 0.057478 0.488 0.117742
7 Barry Allen 0.050267 0.592 0.097677
1 Oliver Queen 0.048903 0.512 0.116387
0 Felicity Smoak 0.040533 0.440 0.065419
34 Alex Danvers 0.040261 0.352 0.058516
10 Cisco Ramon 0.034321 0.464 0.032194
5 Sara Lance 0.032335 0.440 0.052516
9 Caitlin Snow 0.031984 0.488 0.070516
48 Lena Luthor 0.029198 0.288 0.079419
21 Leonard Snart 0.029180 0.360 0.025613

The next one is degree centrality, which simply measures how many other characters a person is connected to (unlike PageRank, it ignores the edge weights). The same three people are at the top (four, since Kara shares her spot with Caitlin). But people like Caitlin and Cisco rose, while people like Felicity Smoak fell. Some people like Lena Luthor and Alex Danvers are no longer there (both from Supergirl), while people like Mick Rory and Iris West appeared (both from The Flash).
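In the toy graph from before, the difference from PageRank is easy to see: degree centrality cannot tell b and c apart, because both have exactly one neighbour and the weights are ignored.

networkx.degree_centrality(toy)
# {'a': 1.0, 'b': 0.5, 'c': 0.5} -- the degree divided by the number of
# other nodes, with edge weights ignored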

ranking.sort_values("Centrality", ascending=False).head(10)
index PageRank Centrality Between
7 Barry Allen 0.050267 0.592 0.097677
1 Oliver Queen 0.048903 0.512 0.116387
25 Kara Danvers 0.057478 0.488 0.117742
9 Caitlin Snow 0.031984 0.488 0.070516
10 Cisco Ramon 0.034321 0.464 0.032194
12 Iris West 0.028664 0.448 0.124968
0 Felicity Smoak 0.040533 0.440 0.065419
5 Sara Lance 0.032335 0.440 0.052516
21 Leonard Snart 0.029180 0.360 0.025613
23 Mick Rory 0.025047 0.360 0.030258

The last one is betweenness. A node with high betweenness is a bridge between different clusters, and removing it would lengthen the paths between other people. Here, Iris West at the top does not really make sense.
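In the same toy graph, all of the betweenness sits on the middle node, because every path between b and c has to pass through a.

networkx.betweenness_centrality(toy)
# {'a': 1.0, 'b': 0.0, 'c': 0.0} -- a lies on the only shortest path
# between the other two nodes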

ranking.sort_values("Between", ascending=False).head(10)
index PageRank Centrality Between
12 Iris West 0.028664 0.448 0.124968
25 Kara Danvers 0.057478 0.488 0.117742
1 Oliver Queen 0.048903 0.512 0.116387
7 Barry Allen 0.050267 0.592 0.097677
48 Lena Luthor 0.029198 0.288 0.079419
9 Caitlin Snow 0.031984 0.488 0.070516
0 Felicity Smoak 0.040533 0.440 0.065419
34 Alex Danvers 0.040261 0.352 0.058516
64 James Olsen 0.015777 0.232 0.055677
5 Sara Lance 0.032335 0.440 0.052516

Because Iris does not make much sense, I also calculated the unweighted betweenness. One likely reason for the odd weighted result is that networkx treats the weight attribute as a distance when computing weighted betweenness, so pairs that appear together in many stories are treated as being far apart, which is the opposite of what the co-occurrence counts mean here. With the unweighted version, Kara makes a lot more sense, since she is the connecting node between her Earth-38 and Earth-1. This is also why Oliver and Barry make sense, since they are the ones who usually cross over together with Kara.

betweenes_2 = pandas.DataFrame.from_dict(networkx.betweenness_centrality(S), orient="index", columns=["Between2"])
betweenes_2.reset_index(level=0, inplace=True)
ranking = ranking.merge(betweenes_2, left_on='index', right_on='index')
ranking.sort_values("Between2", ascending=False).head(10)
index PageRank Centrality Between Between2
25 Kara Danvers 0.057478 0.488 0.117742 0.210268
1 Oliver Queen 0.048903 0.512 0.116387 0.141382
7 Barry Allen 0.050267 0.592 0.097677 0.140181
10 Cisco Ramon 0.034321 0.464 0.032194 0.070581
34 Alex Danvers 0.040261 0.352 0.058516 0.069021
5 Sara Lance 0.032335 0.440 0.052516 0.065604
0 Felicity Smoak 0.040533 0.440 0.065419 0.062297
9 Caitlin Snow 0.031984 0.488 0.070516 0.061614
12 Iris West 0.028664 0.448 0.124968 0.058943
48 Lena Luthor 0.029198 0.288 0.079419 0.045495
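Another option, which I did not use above, would be to keep the weights but invert them first, so that a pair that appears together often counts as close rather than far apart. A sketch of that idea (the distance attribute is something I am introducing here, not part of the data):

# A sketch of an alternative: turn co-occurrence counts into distances,
# so that frequently co-tagged pairs are treated as close together.
for u, v, data in S.edges(data=True):
    data["distance"] = 1 / data["weight"]
betweenes_3 = networkx.betweenness_centrality(S, weight="distance")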

Next, I wanted to see if an algorithm could find any communities in the data. Looking at the picture below, it did a much better job than expected: most people were correctly assigned to the series they appear in.

plt.figure(1,figsize=(30,30)) 
partition = community.best_partition(S, weight="weight")
size = (len(set(partition.values())))
pos = networkx.spring_layout(S)
count = 0
colors = [cm.jet(x) for x in numpy.linspace(0, 1, size)]
labels = {node[0]: node[0] for node in S.nodes(data=True)}
for com in set(partition.values()):
    list_nodes = [nodes for nodes in partition.keys()
                                if partition[nodes] == com]
    networkx.draw_networkx_nodes(S, pos, list_nodes, node_size = 2000, node_color=colors[count])
    networkx.draw_networkx_labels(S, pos, labels, font_size=20, alpha=0.8)
    count = count + 1
networkx.draw_networkx_edges(S, pos)

png
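Since the picture is quite crowded, the communities can also be listed straight from the partition dictionary, and because best_partition here comes from the python-louvain package, its modularity function gives a rough numeric check of how well separated they are. A small sketch:

# List the members of every detected community and print the modularity
# of the partition as a rough quality check.
members_by_group = dict()
for character, group in partition.items():
    members_by_group.setdefault(group, []).append(character)
for group in sorted(members_by_group):
    print(group, sorted(members_by_group[group]))
print(community.modularity(partition, S))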

Here is the code to save the graph above into a JSON file, for visualization on the website.

all_characters_as_numbers = dict()
for i, name in enumerate(S.nodes()):
    all_characters_as_numbers[name] = i
nodes = [{'name': all_characters_as_numbers[i], 'label': i, 'fandom': str(partition[i])} for i in S.nodes()]
links = [{'source': all_characters_as_numbers[u[0]], 'target': all_characters_as_numbers[u[1]]} for u in S.edges()]
with open('graph2.json', 'w') as f:
    json.dump({'nodes': nodes, 'links': links},
              f, indent=4,)
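As a quick sanity check, the file can be read back to look at its shape; the concrete values in the comments are only illustrative.

with open("graph2.json") as f:
    graph_check = json.load(f)
# Every node carries a numeric id, the character name and the community it
# was assigned to; every link references two of those numeric ids.
print(graph_check["nodes"][0])   # e.g. {'name': 0, 'label': 'Oliver Queen', 'fandom': '0'}
print(graph_check["links"][0])   # e.g. {'source': 0, 'target': 3}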

Now I wanted to visualize the different communities separately. I put a higher limit on the connections, so an edge is only drawn if the two characters appear together in more than 500 stories. This makes the graphs more understandable.

lowest_weight_subgraph = 500
nodes_arrow = {item: group for item, group in partition.items() if group == 0}
S = networkx.Graph()
S.add_edges_from([a for a in all_relationships if a[1] in nodes_arrow and a[0] in nodes_arrow and a[2]['weight'] > lowest_weight_subgraph])
plt.figure(1,figsize=(30,30)) 
partition_arrow = community.best_partition(S, weight="weight")
size = (len(set(partition_arrow.values())))
pos = networkx.spring_layout(S)
count = 0
colors = [cm.jet(x) for x in numpy.linspace(0, 1, size)]
labels = {node[0]: node[0] for node in S.nodes(data=True)}
for com in set(partition_arrow.values()):
    list_nodes = [nodes for nodes in partition_arrow.keys()
                                if partition_arrow[nodes] == com]
    networkx.draw_networkx_nodes(S, pos, list_nodes, node_size = 2000, node_color=colors[count])
    networkx.draw_networkx_labels(S, pos, labels, font_size=20, alpha=0.8)
    count = count + 1
networkx.draw_networkx_edges(S, pos)

png

nodes_lot = {item: group for item, group in partition.items() if group == 1}
S = networkx.Graph()
S.add_edges_from([a for a in all_relationships if a[1] in nodes_lot and a[0] in nodes_lot and a[2]["weight"] > lowest_weight_subgraph])
plt.figure(1,figsize=(30,30)) 
partition_lot = community.best_partition(S, weight="weight")
size = (len(set(partition_lot.values())))
pos = networkx.spring_layout(S)
count = 0
colors = [cm.jet(x) for x in numpy.linspace(0, 1, size)]
labels = {node[0]: node[0] for node in S.nodes(data=True)}
for com in set(partition_lot.values()):
    list_nodes = [nodes for nodes in partition_lot.keys()
                                if partition_lot[nodes] == com]
    networkx.draw_networkx_nodes(S, pos, list_nodes, node_size = 2000, node_color=colors[count])
    networkx.draw_networkx_labels(S, pos, labels, font_size=20, alpha=0.8)
    count = count + 1
networkx.draw_networkx_edges(S, pos)

png

nodes_flash = {item: group for item, group in partition.items() if group == 2}
S = networkx.Graph()
S.add_edges_from([a for a in all_relationships if a[1] in nodes_flash and a[0] in nodes_flash and a[2]["weight"] > lowest_weight_subgraph])
plt.figure(1,figsize=(30,30)) 
partition_flash = community.best_partition(S, weight="weight")
size = (len(set(partition_flash.values())))
pos = networkx.spring_layout(S)
count = 0
colors = [cm.jet(x) for x in numpy.linspace(0, 1, size)]
labels = {node[0]: node[0] for node in S.nodes(data=True)}
for com in set(partition_flash.values()):
    list_nodes = [nodes for nodes in partition_flash.keys()
                                if partition_flash[nodes] == com]
    networkx.draw_networkx_nodes(S, pos, list_nodes, node_size = 2000, node_color=colors[count])
    networkx.draw_networkx_labels(S, pos, labels, font_size=20, alpha=0.8)
    count = count + 1
networkx.draw_networkx_edges(S, pos)

png

nodes_supergirl = {item: group for item, group in partition.items() if group == 3}
S = networkx.Graph()
S.add_edges_from([a for a in all_relationships if a[1] in nodes_supergirl and a[0] in nodes_supergirl and a[2]["weight"] > lowest_weight_subgraph])
plt.figure(1,figsize=(30,30)) 
partition_supergirl = community.best_partition(S, weight="weight")
size = (len(set(partition_supergirl.values())))
pos = networkx.spring_layout(S)
count = 0
colors = [cm.jet(x) for x in numpy.linspace(0, 1, size)]
labels = {node[0]: node[0] for node in S.nodes(data=True)}
for com in set(partition_supergirl.values()):
    list_nodes = [nodes for nodes in partition_supergirl.keys()
                                if partition_supergirl[nodes] == com]
    networkx.draw_networkx_nodes(S, pos, list_nodes, node_size = 2000, node_color=colors[count])
    networkx.draw_networkx_labels(S, pos, labels, font_size=20, alpha=0.8)
    count = count + 1
networkx.draw_networkx_edges(S, pos)

png
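The four plotting blocks above are nearly identical, so if I reuse this, the drawing could be pulled into a helper along these lines. This is only a sketch: the group numbers that best_partition assigns are not guaranteed to map to the same show on every run.

# A sketch of a helper that factors out the repeated subgraph plotting.
def draw_community_subgraph(group, min_weight=500):
    members = {name for name, g in partition.items() if g == group}
    subgraph = networkx.Graph()
    subgraph.add_edges_from([a for a in all_relationships
                             if a[0] in members and a[1] in members
                             and a[2]["weight"] > min_weight])
    plt.figure(figsize=(30, 30))
    sub_partition = community.best_partition(subgraph, weight="weight")
    pos = networkx.spring_layout(subgraph)
    colors = [cm.jet(x) for x in numpy.linspace(0, 1, len(set(sub_partition.values())))]
    for count, com in enumerate(set(sub_partition.values())):
        com_nodes = [node for node in sub_partition if sub_partition[node] == com]
        networkx.draw_networkx_nodes(subgraph, pos, com_nodes,
                                     node_size=2000, node_color=colors[count])
    networkx.draw_networkx_labels(subgraph, pos, {node: node for node in subgraph.nodes()},
                                  font_size=20, alpha=0.8)
    networkx.draw_networkx_edges(subgraph, pos)

draw_community_subgraph(0)   # for example, the group that came out as Arrow above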

To finish, I played a little with visualization in JavaScript, mostly so that I can eventually put the visualization directly into the blog as part of the storytelling. Here is the code and the current result.

%%html
<div id="d3-example"></div>
<style>
.node {stroke: #fff; stroke-width: 1.5px;}
.link {stroke: #999; stroke-opacity: .6;}
</style>
%%javascript
// We load the d3.js library from the Web.
require.config({paths:
    {d3: "http://d3js.org/d3.v3.min"}});
require(["d3"], function(d3) {
  // The code in this block is executed when the
  // d3.js library has been loaded.

  // First, we specify the size of the canvas
  // containing the visualization (size of the
  // <div> element).
  var width = 500, height = 500;

  // We create a color scale.
  var color = d3.scale.category10();

  // We create a force-directed dynamic graph layout.
  var force = d3.layout.force()
    .charge(-100)
    .linkDistance(100)
    .size([width, height]);

  // In the <div> element, we create a <svg> graphic
  // that will contain our interactive visualization.
  var svg = d3.select("#d3-example").select("svg")
  if (svg.empty()) {
    svg = d3.select("#d3-example").append("svg")
          .attr("width", width)
          .attr("height", height);
  }

  // We load the JSON file.
  d3.json("graph2.json", function(error, graph) {
    // In this block, the file has been loaded
    // and the 'graph' object contains our graph.

    // We load the nodes and links in the
    // force-directed graph.
    force.nodes(graph.nodes)
      .links(graph.links)
      .start();

    // We create a <line> SVG element for each link
    // in the graph.
    var link = svg.selectAll(".link")
      .data(graph.links)
      .enter().append("line")
      .attr("class", "link");

    // We create a <circle> SVG element for each node
    // in the graph, and we specify a few attributes.
    var node = svg.selectAll(".node")
      .data(graph.nodes)
      .enter().append("circle")
      .attr("r", 5)  // radius
      .style("fill", function(d) {
         // The node color depends on the community (fandom).
         return color(d.fandom);
      })
      .call(force.drag);

    // We bind the positions of the SVG elements
    // to the positions of the dynamic force-directed
    // graph, at each time step.
    force.on("tick", function() {
      link.attr("x1", function(d){return d.source.x})
          .attr("y1", function(d){return d.source.y})
          .attr("x2", function(d){return d.target.x})
          .attr("y2", function(d){return d.target.y});

      node.attr("cx", function(d){return d.x})
          .attr("cy", function(d){return d.y});
    });
  });
});