
N-grams in Python

This came up as a university assignment, so I put together a simple implementation.

What is an N-gram?

A statistic commonly used as a feature in natural language processing. A 1-gram (uni-gram) is simply the frequency of individual words, a 2-gram (bi-gram) is the frequency of sequences of two consecutive words, and a 3-gram (tri-gram) is the frequency of sequences of three consecutive words.
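
As a quick illustration, here is a minimal sketch of counting bi-grams over a whitespace-tokenized sentence; the sample sentence and the count_ngrams helper are only illustrations, not part of the assignment code.

from collections import defaultdict


def count_ngrams(tokens, n):
    # count every run of n consecutive tokens
    counts = defaultdict(int)
    for i in range(len(tokens) - n + 1):
        counts['_'.join(tokens[i:i + n])] += 1
    return counts


if __name__ == "__main__":
    tokens = "to be or not to be".split()
    print(dict(count_ngrams(tokens, 2)))
    # e.g. {'to_be': 2, 'be_or': 1, 'or_not': 1, 'not_to': 1} (order may vary)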

Let's try running it on web documents.

Crawler

Starting from a set of seed URLs, extract the URLs found in the href attributes of each page's HTML, and repeat the process down to depth D.

# coding:utf-8

import json
import urllib2
from urlparse import urljoin
from BeautifulSoup import *


def crawl(pages, depth=2):
    # Breadth-first crawl: fetch the current frontier of pages, collect the
    # links they contain, and use those links as the next frontier, `depth` times.
    setpages = set()

    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib2.urlopen(page)
            except:
                print "Could not open %s" % page
                continue
            soup = BeautifulSoup(c.read())
            setpages.add(page)  # record this page as visited

            links = soup('a')
            for link in links:
                if ('href' in dict(link.attrs)):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1: continue  # skip URLs containing stray quotes
                    url = url.split('#')[0]  # drop the fragment identifier
                    if url[0:4] == 'http' and not url in setpages:
                        newpages.add(url)
        pages = newpages

    return list(setpages)


if __name__ == "__main__":
    urls = ["http://news.yahoo.co.jp/"]

    pages = crawl(urls)

    # save the crawled URLs so the N-gram script can pick them up
    f = open("./urls.json", "w")
    json.dump(pages, f)
    f.close()

N-gram

Specify an arbitrary N and compute the counts for every n up to N.
Keeping things basic, MeCab handles the morphological analysis (word segmentation) and BeautifulSoup handles the DOM parsing.
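
To show what MeCab's wakati output looks like, here is a minimal sketch; it assumes MeCab and its Python binding are installed with a default Japanese dictionary, and the sample sentence is arbitrary.

# coding: utf-8

import MeCab

tagger = MeCab.Tagger("-O wakati")
# wakati-gaki: the sentence comes back as words separated by single spaces
print(tagger.parse("すもももももももものうち"))
# すもも も もも も もも の うち

The full implementation follows.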

# coding: utf-8

import os
import json
import MeCab
import urllib2
from collections import defaultdict
from operator import itemgetter
from BeautifulSoup import *


class Ngram():

    def __init__(self, N=3):
        self.N = N
        self.tagger = MeCab.Tagger("-O wakati")

    def get(self, text, ngram=None):
        # Split the text into words with MeCab (wakati-gaki), then count every
        # n-gram for n = 1..N.  Passing in a previous `ngram` list lets the
        # counts accumulate across multiple documents.
        seq = self.tagger.parse(text.encode('utf-8')).split()

        if ngram is None:
            ngram = [defaultdict(int) for x in range((self.N + 1))]
            ngram[0] = None  # index 0 is unused; ngram[n] holds the n-gram counts

        for i in range(len(seq)):
            for n in range(1, self.N + 1):
                idx = i - n + 1  # start index of the n-gram ending at position i
                if idx >= 0:  # count only windows that fit inside the sequence
                    key = '_'.join(seq[idx:i + 1])
                    ngram[n][key] += 1

        return ngram


class HTMLParser():

    def get(self, url):
        # fetch the page and return only its visible text
        try:
            c = urllib2.urlopen(url)
        except:
            print "Could not open %s" % url
            return ""

        soup = BeautifulSoup(c.read())
        text = '\n'.join(self.__getNavigableStrings(soup))
        return text

    def __getNavigableStrings(self, soup):
        # Recursively yield every visible text node, skipping comments,
        # declarations, and the contents of script/style tags.
        if isinstance(soup, NavigableString):
            if type(soup) not in (Comment, Declaration) and soup.strip():
                yield soup
        elif soup.name not in ('script', 'style'):
            for c in soup.contents:
                for g in self.__getNavigableStrings(c):
                    yield g


if __name__ == "__main__":

    f = open("urls.json", "r")
    urls = json.load(f)
    f.close()
    print "Count of urls : " + str(len(urls))

    N = 10
    hp = HTMLParser()
    ng = Ngram(N)

    # accumulate n-gram counts over all crawled pages
    ngram = None
    for url in urls:
        text = hp.get(url)
        ngram = ng.get(text, ngram)

    # make sure the output directory exists before writing the TSV files
    if not os.path.isdir('outputs'):
        os.makedirs('outputs')

    for n in range(1, (N + 1)):
        f = open('outputs/{:02d}.tsv'.format(n), 'w')
        out = ""
        for k, v in sorted(ngram[n].items(), key=itemgetter(1), reverse=True):
            out += "{}\t{}\n".format(k, v)
        f.write(out)
        f.close()

Wrapping up

This is only a quick, first-pass implementation.
Strictly speaking, start-of-sentence and end-of-sentence symbols should be added as well... (a sketch of that padding is below).
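
As a rough idea of what those markers would look like, here is a minimal sketch that pads the token sequence with <s> and </s> symbols before counting; the symbol names and the pad_tokens helper are my own choices, not part of the code above.

def pad_tokens(tokens, n, start='<s>', end='</s>'):
    # pad with n-1 start symbols and one end symbol so that n-grams at the
    # sentence boundaries are counted as well
    return [start] * (n - 1) + tokens + [end]


if __name__ == "__main__":
    tokens = ['I', 'like', 'natural', 'language', 'processing']
    print(pad_tokens(tokens, 3))
    # ['<s>', '<s>', 'I', 'like', 'natural', 'language', 'processing', '</s>']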

