Getting Blogs From Tumblr

Up until now, I was working on the list of Tumblr blogs grouped by the writer's personality type. Now I wanted to get the text of their blog posts.

Here I made sure to no longer bother with timeouts between requests, because of the API restrictions: the daily limit was always reached quite soon anyway. So it did not matter whether I ran the script by hand or had a timer function scheduled.

    import pytumblr
    import re
    import os
    import sys

    # Regexes used to pull the blog name out of a Tumblr post URL.
    # NOTE(review): the original value of tumblr_url was the invalid raw
    # string r"\" (a Python syntax error); reconstructed here as a base-URL
    # pattern -- confirm the original intent.
    tumblr_url = r"https?://([\w-]+)\.tumblr\.com"
    blog_url = r"/(\d+)/([\w-]+)"     # post id followed by a slug
    name_after_url = r"/\d+/[\w-]+"   # same shape, used only as an existence test
    digits_only = r"/(\d+)"           # post id with no slug

    offset = 0
    # NOTE(review): get_all_webpages and write_set_to_file are defined further
    # down this script, so these calls raise NameError when the file runs
    # top-to-bottom; the function definitions should precede this point.
    webpages = get_all_webpages("file.csv")
    write_set_to_file(webpages, "file.csv")
    url = list(webpages)[0]
    tag = ""

    # Authenticate via OAuth.
    # NOTE(review): no credentials are passed here; pytumblr requires at least
    # a consumer key (and OAuth tokens for private endpoints) -- confirm they
    # are supplied elsewhere, e.g. via environment configuration.
    client = pytumblr.TumblrRestClient()

    def get_all_webpages(filename):
        """Read a tab-separated file and return the set of webpage URLs.

        Each line is split on tabs and the first field is taken as the URL.
        NOTE(review): the original loop split every line but never added
        anything to the set, so it always returned an empty set; the column
        choice here is an assumption -- confirm against the actual data file.
        """
        allwebpages = set()
        with open(filename, "r") as read:
            for line in read:
                fields = line.rstrip("\n").split("\t")
                # Skip blank lines / lines with an empty first field.
                if fields and fields[0]:
                    allwebpages.add(fields[0])
        return allwebpages

    def write_set_to_file(settowrite, singlefile):
        """Persist every element of *settowrite* on its own line in *singlefile*."""
        lines = (item + "\n" for item in settowrite)
        with open(singlefile, "w") as out:
            out.writelines(lines)
        return None

    def get_blog_post(mbti, url, offset):
        """Fetch one text post from the blog at *url* and save it to disk.

        Queries the Tumblr API at *offset*, writes the first post's title and
        body to "<mbti>/<blog_name>.txt", and returns the next offset
        (offset + 20). Returns "finished" when the blog has no more text
        posts, or None when the API rate limit has been hit.

        NOTE(review): the original body was garbled (broken indentation, an
        empty `with` block, and two truncated `re` calls); this is a
        reconstruction of the apparent intent -- confirm against history.
        """
        blogpost = client.posts(url, type='text', limit=20, offset=offset)

        # Rate limit reached: record where to resume, signal the caller to stop.
        # Checked first because a rate-limited response carries no posts.
        if "meta" in blogpost and blogpost["meta"].get("msg") == "Limit Exceeded":
            print("Next offset for " + mbti + " is: " + str(offset))
            with open("offset", "w") as write:
                write.write(str(offset))
            return None

        # No posts left on this blog.
        if not blogpost["posts"]:
            print("Finished with blog: " + url)
            return "finished"

        post = blogpost["posts"][0]
        post_url = post["post_url"]
        title = post["title"]
        body = post["body"]

        # Derive a filesystem-safe blog name from the post URL: either
        # "/<id>/<slug>" joined with a dash, or just the numeric post id.
        if re.compile(name_after_url).search(post_url) is not None:
            match = re.compile(blog_url).search(post_url.strip())
            blog_name = "-".join(match.groups())
        else:
            match = re.compile(digits_only).search(post_url.strip())
            blog_name = "".join(match.groups())

        with open(mbti + "/" + blog_name + ".txt", "w", encoding="utf8") as write:
            # Either field may be None on a text post.
            title = title or ""
            body = body or ""
            # The original encoded both to bytes and wrote them to a text-mode
            # file, which raises TypeError in Python 3; write str instead and
            # let the file handle the utf-8 encoding.
            write.write(title + "\n\n\n" + body)

        return offset + 20

    # Main loop: walk every blog in `webpages`, 20 posts at a time.
    # NOTE(review): the original loop never removed the finished url from the
    # set, left `if not webpages:` with an empty body (a syntax error), and
    # kept looping after a rate limit (offset is None); reconstructed so that
    # it terminates in both cases.
    while 1:
        offset = get_blog_post(tag, url, offset)
        if offset is None:
            # Rate limit hit: save the remaining work queue and stop.
            write_set_to_file(webpages, "blogsistp-cleaned-2.csv")
            break
        if offset == "finished":
            # Done with this blog; drop it and persist the remaining set.
            webpages.discard(url)
            write_set_to_file(webpages, "file-out.csv")
            if not webpages:
                break
            url = list(webpages)[0]
            offset = 0

Because of this restriction, I decided to abandon this way of getting information, since it would take at least a month to get the amount of data that I wanted.

Since I planned to use this for the school presentation as well, that was not something I could afford. So I decided to try to find a different source of data.