#!/usr/bin/python3
"""Poll configured RSS feeds and render an image diff when a headline changes.

Every feed entry is stored in Redis under its unique id. On later runs the
stored title is compared with the current one; if they differ, an HTML diff
of the two titles is rendered to an image file.
"""

import difflib
import hashlib
import json
import time
from pprint import pprint

import confuse
import feedparser
import imgkit
import redis
from diff_match_patch import diff_match_patch

#
# Idea block:
#
#   We could also store a hash of the content next to each article ID, so that
#   we first check only whether the hash changed and compare the actual
#   strings only afterwards.
#   But I am not sure it would really be more efficient.
#
#   For now we only check headlines, but we can gradually add the article
#   description as well.

config = confuse.Configuration('headline', __name__)
config.set_file('config.yaml')

dmp = diff_match_patch()
rc = redis.Redis(host='localhost', port=6379, db=0)

# Options handed to imgkit when rendering a diff image.
image_options = {
    'width': '450',
}


def write_article(article, rc):
    """Store ``article['content']`` as JSON in Redis under ``article['rss_id']``.

    Args:
        article: dict with the keys ``'rss_id'`` and ``'content'``.
        rc: Redis client used for the write.
    """
    rc.set(article['rss_id'], json.dumps(article['content']))


def process_diff(diff, article):
    """Render a title diff to an image file.

    The diff is cleaned up semantically (in place), converted to HTML and
    rendered with imgkit. The output file is named after the MD5 hex digest
    of the article's ``rss_id`` with a ``.jpg`` suffix; the name is relative,
    so the file lands in the current working directory.

    Args:
        diff: diff list as produced by ``dmp.diff_main``.
        article: dict with at least the key ``'rss_id'`` (a ``str``).

    Returns:
        Always ``True``.
    """
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_prettyHtml(diff)
    # The digest is only used to derive a filesystem-safe file name.
    filename = hashlib.md5(article['rss_id'].encode()).hexdigest() + ".jpg"
    # Use the shared module-level options instead of a duplicated literal.
    imgkit.from_string(html_diff, filename, options=image_options)
    return True


def process_item(article, rc):
    """Compare an article with its stored version and react to title changes.

    A new article is simply stored. For a known article whose title changed,
    the diff image is rendered and the stored copy is replaced by the new
    content, so the same change is not reported again on the next run.

    Args:
        article: dict with the keys ``'rss_id'`` and ``'content'``;
            ``content`` must contain ``'title'``.
        rc: Redis client holding the previously seen articles.

    Returns:
        Always ``True``.
    """
    # One GET instead of EXISTS + GET: fewer round-trips, and no crash if the
    # key vanishes between the two calls.
    stored = rc.get(article['rss_id'])
    if stored is None:
        # Article is new, just create it and exit.
        write_article(article, rc)
        return True

    old = json.loads(stored)
    new = article['content']
    if old['title'] == new['title']:
        # Article is the same. All good!
        return True

    print('Article changed. Fuck the world.')
    diff = dmp.diff_main(old['title'], new['title'])
    process_diff(diff, article)
    # Persist only after the diff was rendered successfully, so a failed
    # render is retried on the next run.
    write_article(article, rc)
    return True


def _build_article(item, unique_tag, medium):
    """Turn one parsed feed entry into the article dict used by this script."""
    parsed = item.get('published_parsed')
    if parsed is not None:
        published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', parsed)
    else:
        # The parsed date is absent when the feed gives no usable date;
        # fall back to the raw string rather than crashing in strftime.
        published = item.get('published', '')

    return {
        'rss_id': item[unique_tag],
        'content': {
            'title': item['title'],
            # Not every entry carries a description.
            'description': item.get('description', ''),
            'published': published,
            'link': item['link'],
            'medium': medium,
        },
    }


article_count = 0

for feed in config['feeds']:
    rss_source = str(feed['rss_source'])
    unique_tag = str(feed['unique_tag'])
    name = str(feed['name'])

    rss = feedparser.parse(rss_source)

    for item in rss['entries']:
        article = _build_article(item, unique_tag, name)
        article_count += 1
        process_item(article, rc)

print("Processed articles:")
print(article_count)