Blog of Sara Jakša

I Made a Detour With Tumblr By Calculating Tag Frequency

While waiting for the next batch of data to be collected, I decided to make a little detour. I remembered an article that calculated the frequency of words for different Big Five types and found some interesting differences. I took that idea and tried to figure out whether such differences can also be seen in the usage of different tags.

The following code was used to get the frequency of the tags.

    # The sixteen four-letter personality-type tags whose neighbors are counted.
    allTags = {
        "intp", "intj", "istp", "istj",
        "infp", "infj", "isfp", "isfj",
        "entp", "entj", "estj", "estp",
        "esfp", "esfj", "enfp", "enfj",
    }

    def getNeighborTag(singlefile):
        """Count which tags appear alongside each tag listed in ``allTags``.

        The file at ``singlefile`` is read line by line, skipping the first
        line (presumably a header row). The third tab-separated field of each
        remaining line is parsed with ``getListFromString`` and normalized
        with ``cleanTumblrTags``.

        Returns a dict mapping each tag found in ``allTags`` to a dict of
        ``{neighborTag: count}``, where ``count`` is the number of times the
        neighbor shared a line with that tag. A tag is never counted as its
        own neighbor, and a tag without any neighbor gets no entry at all.
        """
        cooccurrence = {}
        with open(singlefile, "r") as handle:
            rows = handle.readlines()

        for row in rows[1:]:
            rowTags = cleanTumblrTags(getListFromString(row.split("\t")[2]))
            for tag in rowTags:
                if tag not in allTags:
                    continue
                for other in rowTags:
                    if other != tag:
                        # Create the per-tag counter lazily, on the first
                        # real neighbor only.
                        counts = cooccurrence.setdefault(tag, {})
                        counts[other] = counts.get(other, 0) + 1
        return cooccurrence

    def getFrequencyDictionary(allTags, singlefile, output):
        """Write the neighbor-tag frequencies found in ``singlefile`` to ``output``.

        The counts come from ``getNeighborTag(singlefile)`` and are regrouped
        by frequency with ``reverseFrequencyDictionary``. One line is written
        per (tag, neighbor) pair in the form ``tag<TAB>neighbor<TAB>count``.

        The ``allTags`` parameter is accepted but not used by this function.
        Returns ``None``.
        """
        byFrequency = {
            typeTag: reverseFrequencyDictionary(neighbors)
            for typeTag, neighbors in getNeighborTag(singlefile).items()
        }
        with open(output, "w") as out:
            for typeTag, buckets in byFrequency.items():
                for count, neighbors in buckets.items():
                    for neighbor in neighbors:
                        out.write("\t".join((typeTag, neighbor, str(count))) + "\n")

    def reverseFrequencyDictionary(inputDict):
        """Invert ``{item: count}`` into ``{count: set of items with that count}``."""
        grouped = {}
        for item, count in inputDict.items():
            grouped.setdefault(count, set()).add(item)
        return grouped


    def getListFromString(listAsString):
        """Parse the text form of a list of strings back into a list.

        Accepts text such as ``[u'intp', u'mbti']`` (Python 2 style) or
        ``['intp', 'mbti']`` and returns ``['intp', 'mbti']``.

        Compared with a plain slice-and-split this:
        * ignores surrounding whitespace, so a trailing newline does not
          cause the closing bracket to be kept instead of removed;
        * removes a ``u`` prefix only when it really is a prefix directly
          followed by a quote, instead of whenever ``u'`` occurs anywhere;
        * removes single or double quotes, so ``u"it's"`` becomes ``it's``;
        * returns an empty list for ``[]`` rather than one empty string.

        Limitations: items are split on ``", "``, so a tag that itself
        contains a comma followed by a space is split in two, and escape
        sequences inside the quotes are not decoded.
        """
        text = listAsString.strip()
        if text.startswith("[") and text.endswith("]"):
            text = text[1:-1]
        if not text.strip():
            return []

        items = []
        for piece in text.split(", "):
            item = piece.strip()
            # Drop a unicode-literal prefix only when it directly precedes
            # the opening quote.
            if item[:1] in ("u", "U") and item[1:2] in ("'", '"'):
                item = item[1:]
            # Drop one pair of matching surrounding quotes.
            if len(item) >= 2 and item[0] == item[-1] and item[0] in ("'", '"'):
                item = item[1:-1]
            items.append(item)
        return items

    def cleanTumblrTags(tagList):
        """Normalize tags in place: lowercase them and drop a leading ``c: ``.

        The list is modified in place and the same list object is returned.

        The ``c: `` marker is removed only when it starts the tag, and it is
        removed completely (all three characters). Testing for it anywhere
        in the tag would cut the first characters off tags such as
        ``music: rock``, and removing only two characters would leave a
        leading space that stops the tag from matching anything. The check
        runs after lowercasing, so ``C: `` is handled the same way.
        """
        prefix = "c: "
        for index, tag in enumerate(tagList):
            tag = tag.lower()
            if tag.startswith(prefix):
                tag = tag[len(prefix):]
            tagList[index] = tag
        return tagList

    # Run the analysis: read the posts from the input file and write one
    # "tag<TAB>neighbor<TAB>count" line per pair to the output file.
    getFrequencyDictionary(
        allTags,
        singlefile="input-file.csv",
        output="output-file.csv",
    )