WIPOのpatentscopeで作成した
RSSからタイトルとサマリを抜き取って
英数字以外で区切った単語リストを作るスクリプト。
sys.argv[1]にRSSのURLリスト(1行に1URLのファイル)を指定すると、
パテント番号と単語の数を出力する。
が
ターゲットや疾患がばっちりサマリに載ってる訳でもないしなあ。
ドキュメントでクラスタリングするのは難しいかな。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Build a patent-by-word count table from a list of RSS feeds.

Usage: python SCRIPT URL_LIST_FILE

URL_LIST_FILE holds one feed URL per line.  For every feed entry the summary
and title are split into lowercase ASCII words and counted.  The result is
written to ``patenrdata.txt`` as a tab-separated table: a header row
(``PATENT_NO`` followed by every word seen) and one row per patent number
(the ``WO<digits>`` token found in the entry link) with that entry's counts.

The code runs unchanged on Python 2.7 and Python 3.
"""
import collections
import re
import sys

try:
    # Python 2 only.  Unused below; kept because the original script imported it.
    import urllib2  # noqa: F401
except ImportError:
    urllib2 = None

# Spelling kept byte-for-byte from the original script; renaming the output
# file could break whatever reads it afterwards.
OUTPUT_PATH = "patenrdata.txt"

# Compiled once instead of on every call.
_TAG_RE = re.compile(r'<[^>]+>')
_NON_ALPHA_RE = re.compile(r'[^A-Za-z]+')
_PATENT_NO_RE = re.compile(r"WO\d+")


def getwords(html):
    """Return the lowercase ASCII-letter words of *html*, in order.

    Markup tags are replaced by a space, then the text is split on every run
    of characters other than A-Z / a-z, so digits, punctuation and non-ASCII
    letters all act as separators.  Empty input yields an empty list.
    """
    txt = _TAG_RE.sub(' ', html)
    return [word.lower() for word in _NON_ALPHA_RE.split(txt) if word]


def get_patentwordcount(entry):
    """Return ``(patent_no, word_dict)`` for one feed entry.

    Args:
        entry: mapping with text values under ``'summary'``, ``'title'`` and
            ``'link'``.

    Returns:
        A tuple of the first ``WO<digits>`` token found in the link and a
        dict mapping each word of "summary + title" to its occurrence count.

    Raises:
        KeyError: if one of the three keys is missing.
        ValueError: if the link contains no ``WO<digits>`` token (the
            original failed here with an AttributeError on ``None``).
    """
    # The text is processed as-is (no .encode()): the patterns only look at
    # ASCII letters/digits, and str patterns cannot be applied to bytes on
    # Python 3.
    text = entry['summary'] + ' ' + entry["title"]
    link = entry["link"]
    match = _PATENT_NO_RE.search(link)
    if match is None:
        raise ValueError("no patent number (WO<digits>) in link %r" % (link,))
    return match.group(0), dict(collections.Counter(getwords(text)))


def count_entries(entries, apcount, wordcounts):
    """Fold *entries* into the running tallies; both dicts are mutated.

    Args:
        entries: iterable of feed entries accepted by get_patentwordcount().
        apcount: dict word -> number of entries in which the word occurs more
            than once.  Every word seen gets a key, possibly with value 0.
        wordcounts: dict patent number -> that entry's word-count dict.  A
            later entry with the same patent number replaces the earlier one.
    """
    for entry in entries:
        patent_no, wc = get_patentwordcount(entry)
        wordcounts[patent_no] = wc
        # The original ran this tally loop twice, doubling every count.
        for word, count in wc.items():
            apcount.setdefault(word, 0)
            if count > 1:
                apcount[word] += 1


def write_matrix(path, wordlist, wordcounts):
    """Write the tab-separated count table to *path*.

    The header is ``PATENT_NO`` followed by *wordlist*; each following row is
    a patent number and its count for every word in *wordlist* (0 when the
    word does not occur).  Rows are emitted in sorted patent-number order so
    repeated runs give identical files.
    """
    with open(path, "w") as out:
        out.write("\t".join(["PATENT_NO"] + list(wordlist)) + "\n")
        for patent_no in sorted(wordcounts):
            wc = wordcounts[patent_no]
            cells = ["%d" % wc.get(word, 0) for word in wordlist]
            out.write("\t".join([patent_no] + cells) + "\n")


def main(argv=None):
    """Run the script; return a process exit status.

    Args:
        argv: argument vector (defaults to ``sys.argv``); ``argv[1]`` is the
            path of the file listing the feed URLs.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "script"
        sys.stderr.write("usage: %s URL_LIST_FILE\n" % prog)
        return 2

    # Imported here so the helpers above can be imported (and tested) on a
    # machine without the third-party feedparser package.
    import feedparser

    # Strip the trailing newline from every URL and skip blank lines.
    with open(argv[1], 'r') as handle:
        urllist = [line.strip() for line in handle if line.strip()]

    apcount = {}
    wordcounts = {}
    for url in urllist:
        try:
            feed = feedparser.parse(url)
            count_entries(feed.entries, apcount, wordcounts)
        except Exception as exc:
            # Best effort, as in the original: report the feed and carry on
            # with the next one.  Entries tallied before the failure are kept.
            print("Failed to parse feed %s" % url)
            sys.stderr.write("  reason: %r\n" % (exc,))

    # NOTE: the original carried a disabled filter that would keep only words
    # whose document fraction lies between 0.1 and 0.5; as before, every word
    # is kept.  Sorting makes the column order reproducible.
    wordlist = sorted(apcount)
    print(len(wordlist))

    write_matrix(OUTPUT_PATH, wordlist, wordcounts)
    return 0


if __name__ == "__main__":
    sys.exit(main())