Blog of Sara Jakša

Getting Blogs From Tumblr

Up until now, I was working on the list of Tumblr blogs by the type of the writer. Now I wanted to get the text of their blog posts.

What I made sure of here was to no longer bother with a timeout because of the API restrictions, since the daily limit was always reached quite soon anyway. It did not matter if I ran the script by hand or if I had a timeout function programmed in.

    import pytumblr
    import re
    import os
    import sys

    # regular expressions for pulling the blog name and post id out of the post url
    tumblr_url = r"\w+\.tumblr\.com"
    blog_url = r"/(\d+)/([\w-]+)"
    name_after_url = r"/\d+/[\w-]+"
    digits_only = r"/(\d+)"

    # Authenticate via OAuth (the API keys are left out here)
    client = pytumblr.TumblrRestClient()

    def get_all_webpages(filename):
        allwebpages = set()
        with open(filename, "r") as read:
            content = read.readlines()
            for line in content:
                line = line.split("\t")
                allwebpages.add(line[0].strip())
        return allwebpages

    def write_set_to_file(settowrite, singlefile):
        with open(singlefile, "w") as write:
            for element in settowrite:
                write.write(element + "\n")
        return None

    def get_blog_post(mbti, url, offset):
        # ask the API for the next batch of text posts from this blog
        blogpost = client.posts(url, type='text', limit=20, offset=offset)

        try:
            if not blogpost["posts"]:
                print("Finished with blog: " + url)
                return "finished"

            url = blogpost["posts"][0]["post_url"]
            title = blogpost["posts"][0]["title"]
            body = blogpost["posts"][0]["body"]
        except KeyError:
            # the response has no posts, most likely because the daily limit was hit
            print(blogpost)
            if blogpost["meta"]["msg"] == "Limit Exceeded":
                print("Next offset for " + mbti + " is: " + str(offset))
                with open("offset", "w") as write:
                    write.write(str(offset))
            return None

        # make sure the folder for this type exists
        try:
            os.mkdir(mbti)
        except OSError:
            pass

        print(url)
        # build the file name from the post id and the post slug, if the post has one
        if re.search(name_after_url, url) is not None:
            blog_name = re.search(blog_url, url.strip())
            blog_name = blog_name.groups()
            blog_name = "-".join(blog_name)
        else:
            blog_name = re.search(digits_only, url.strip())
            blog_name = blog_name.groups()
            blog_name = "".join(blog_name)

        # save the post as the title, an empty line and then the body
        with open(mbti + "/" + blog_name + ".txt", "w", encoding="utf8") as write:
            if not title:
                title = ""
            if not body:
                body = ""
            write.write(title + "\n\n\n" + body)

        return offset + 20

    # load the list of blogs and start with the first one
    offset = 0
    webpages = get_all_webpages("file.csv")
    write_set_to_file(webpages, "file.csv")
    url = list(webpages)[0]
    tag = ""    # the type used as the folder name, for example "istp"

    while True:
        offset = get_blog_post(tag, url, offset)
        if offset is None:
            # the daily API limit was hit, so save the blogs that are left and stop
            write_set_to_file(webpages, "blogsistp-cleaned-2.csv")
            sys.exit()
        if offset == "finished":
            webpages.remove(url)
            if not webpages:
                break
            url = list(webpages)[0]
            offset = 0
            write_set_to_file(webpages, "file-out.csv")

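The blog_url and digits_only patterns above are what turn a post URL into the file name the post gets saved under. A quick check with a made-up post URL shows what the two branches produce:

    import re

    blog_url = r"/(\d+)/([\w-]+)"
    digits_only = r"/(\d+)"

    # a made-up post url, just for illustration
    post_url = "https://example.tumblr.com/post/123456789/some-post-title"
    print("-".join(re.search(blog_url, post_url).groups()))
    # prints: 123456789-some-post-title

    # a post url without a slug falls back to the digits-only pattern
    print("".join(re.search(digits_only, "https://example.tumblr.com/post/123456789").groups()))
    # prints: 123456789
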
Because of this rate limit, I decided to abandon this way of getting the data, since it would have taken at least a month to collect the amount of data that I wanted.

Since I planned to use this for the school presentation as well, that was not something that I could afford. So I decided to try and find a different source of data.