PythonでN-gram
大学の課題で出たので,簡易に実装してみた.
N-gramとは
自然言語処理の素性としてよく使われる統計量である. 1-gram(uni-gram)だと単語の出現頻度,2-gram(bi-gram)だと連続する二つの単語の出現頻度,3-gram(tri-gram)だと連続する三つの単語の出現頻度である.
Web文書を対象として,解析してみる.
クローラー
シードとなるURLから,HTML中のhref属性のURLを取得し,深さDまで繰り返し行う.
# coding:utf-8 import json import urllib2 from urlparse import urljoin from BeautifulSoup import * def crawl(pages, depth=2): setpages = set() for i in range(depth): newpages = set() for page in pages: try: c = urllib2.urlopen(page) except: print "Could not open %s" % page continue soup = BeautifulSoup(c.read()) setpages.add(page) links = soup('a') for link in links: if ('href' in dict(link.attrs)): url = urljoin(page, link['href']) if url.find("'") != -1: continue url = url.split('#')[0] if url[0:4] == 'http' and not url in setpages: newpages.add(url) pages = newpages return list(setpages) if __name__ == "__main__": urls = ["http://news.yahoo.co.jp/"] pages = crawl(urls) f = open("./urls.json", "w") json.dump(pages, f) f.close()
N-gram
任意のNを指定し,N以下について計算する.
ベーシックに,形態素解析にはMeCabを,DOM解析にはBeautifulSoupを使う.
# coding: utf-8 import sys import json import MeCab import urllib2 from collections import defaultdict from operator import itemgetter from BeautifulSoup import * class Ngram(): def __init__(self, N=3): self.N = N self.tagger = MeCab.Tagger("-O wakati") def get(self, text, ngram=None): seq = self.tagger.parse(text.encode('utf-8')).split() if ngram is None: ngram = [defaultdict(int) for x in range((self.N + 1))] ngram[0] = None for i in range(len(seq)): for n in range(1, self.N + 1): idx = i - n + 1 # check ngram is valid range if idx >= 0: key_words = [] for j in range(idx, i+1): key_words.append(seq[j]) key = '_'.join(key_words) ngram[n][key] += 1 return ngram class HTMLParser(): def get(self, url): try: c = urllib2.urlopen(url) except: print "Could not open %s" % url return "" soup = BeautifulSoup(c.read()) text = '\n'.join(self.__getNavigableStrings(soup)) return text def __getNavigableStrings(self, soup): if isinstance(soup, NavigableString): if type(soup) not in (Comment, Declaration) and soup.strip(): yield soup elif soup.name not in ('script', 'style'): for c in soup.contents: for g in self.__getNavigableStrings(c): yield g if __name__ == "__main__": f = open("urls.json", "r") urls = json.load(f) f.close() print "Count of urls : " + str(len(urls)) N = 10 hp = HTMLParser() ng = Ngram(N) ngram = None for url in urls: text = hp.get(url) ngram = ng.get(text, ngram) for n in range(1, (N + 1)): f = open('outputs/{:02d}.tsv'.format(n), 'w') out = "" for k, v in sorted(ngram[n].items(), key=itemgetter(1), reverse=True): out += "{}\t{}\n".format(k, v) f.write(out) f.close()
おわりに
とりあえずな実装です.
本当は,先頭と終端に記号が必要だけども...
関連記事