Finds synonyms of a word.

Finds a synonym of a word in a sentence, keeping its grammatical category (noun, adjective, verb or adverb).

#!/usr/bin/python
# -*- coding: utf-8 -*-


# (c) Stéphanie Vilayphiou
# License: GNU-GPL 3
#
# This program is free software: you can redistribute it and/or 
# modify it under the terms of the GNU General Public License as published 
# by the Free Software Foundation, either version 3 of the License, 
# or any later version.
#
# Please don't forget to mention the author's name along your new 
# project as specified in the license.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from nltk import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords, brown
import random

####################################################
# TO DO
# 
# + better automatic tagging.
# + keep plurals and verb forms.
####################################################

# input: the sentence to work on, stored as a list of its words
pangram = "The quick brown fox runs over the lazy dog".split(' ')

# English stopwords; from here on the name refers to the list of words,
# no longer to the corpus reader it was imported as
stopwords = stopwords.words('english')


####################################################
# This section is recompiled from the Natural Language Processing Book
# (chapter "Categorizing and Tagging Words"), originally found at
# http://nltk.googlecode.com/svn/trunk/doc/book/book.html
# and now published at https://www.nltk.org/book/
#
# Automatic tagging of a sentence, based on Brown News corpus
brown_news_tagged = brown.tagged_sents(categories='news')
# Only the first 90% of the tagged sentences are used for training
size = int(len(brown_news_tagged) * 0.9)
brown_news_train = brown_news_tagged[:size]
# Stand-alone unigram tagger (no backoff); it is not part of the chain below
unigram_tagger = UnigramTagger(brown_news_train)
# Uses BigramTagger -- if it fails, it uses the UnigramTagger -- if it fails, it uses DefaultTagger
t0 = DefaultTagger('NN')
t1 = UnigramTagger(brown_news_train, backoff=t0)
t2 = BigramTagger(brown_news_train, backoff=t1)
# Apply the automatic tagging to our list of words
tags = t2.tag(pangram)
# Parenthesised so that the line is valid in Python 3 and prints the same in Python 2
print(tags)
####################################################

def pick_word(tagged_words=None, stop_list=None):
        """
        Picks a random word of a list of tagged words which is not a stopword.

        tagged_words: sequence of (word, tag) pairs. Defaults to the
                      module-level list `tags`.
        stop_list:    lowercase words that must never be picked. Defaults to
                      the module-level list `stopwords`.

        Returns a tuple (word, tag, index), index being the position of the
        picked word in tagged_words.
        Raises ValueError when there is nothing to pick (empty list, or every
        word is a stopword).
        """
        if tagged_words is None:
                tagged_words = tags
        if stop_list is None:
                stop_list = stopwords
        excluded = set(stop_list)

        # Draws among the eligible positions only, instead of retrying a random
        # draw recursively: every eligible word is still equally likely, and a
        # sentence without any eligible word no longer recurses without end.
        candidates = [index for index, pair in enumerate(tagged_words)
                      if pair[0].lower() not in excluded]
        if not candidates:
                raise ValueError("no word to pick: every word is a stopword")

        picked_index = random.choice(candidates)
        word = tagged_words[picked_index][0]
        tag = tagged_words[picked_index][1]
        print(word)
        return (word, tag, picked_index)

def _wordnet_pos(tag):
        """
        Translates a tagger part-of-speech tag to a Wordnet category.

        Returns 'n' (noun), 'a' (adjective), 'v' (verb) or 'r' (adverb), or
        None when the tag has no equivalent in the lists below.
        """
        categories = (
                # The tagger is trained on the Brown corpus, whose tagset writes
                # proper nouns NP/NPS and superlatives JJT/RBT; the other tags
                # are the ones that were already listed.
                ('n', ['NN', 'NNS', 'NNP', 'NNPS', 'FW', 'NP', 'NPS']),
                ('a', ['JJ', 'JJS', 'JJR', 'JJT']),
                ('v', ['MD', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']),
                ('r', ['RB', 'RBR', 'RBS', 'RP', 'WRB', 'RBT']),
        )
        for category, members in categories:
                if tag in members:
                        return category
        return None


def thesaurus():
        """
        Finds in Wordnet synonyms of a word of the same grammatical lexical category.

        The word is chosen by pick_word(). It is replaced in the module-level
        list `pangram` (modified in place) by a synonym picked at random.
        When the word has no synonym, or when its tag has no Wordnet
        category, the word is kept.

        Returns the resulting sentence as a string: words joined by spaces,
        underscores of multi-word synonyms turned into spaces, followed by
        a period.
        """
        word, word_tag, word_index = pick_word()

        # lists all synonyms of the target word, respecting the lexical category.
        synonyms = []
        pos = _wordnet_pos(word_tag)
        # A tag without Wordnet category is not looked up at all: passing it
        # as `pos` would not filter on any real category.
        if pos is not None:
                for synset in wn.synsets(word, pos=pos):
                        # lemma_names is an attribute in old NLTK releases
                        # and a method in NLTK 3
                        lemma_names = synset.lemma_names
                        if callable(lemma_names):
                                lemma_names = lemma_names()
                        for lemma in lemma_names:
                                # case is ignored so that "dog" is not
                                # offered as a synonym of "Dog"
                                if lemma.lower() != word.lower():
                                        synonyms.append(lemma)
        print(synonyms)

        if synonyms:
                # picks a random synonym from the synonyms list
                new_word = random.choice(synonyms)
        else:
                # if the synonyms list is empty, keeps the picked word
                new_word = word
        # replaces the target word by the found word in the list
        pangram[word_index] = new_word
        # makes a string out of the list
        return ' '.join(pangram).replace('_', ' ') + '.'

# Replaces one word of the sentence and shows the new sentence.
result = thesaurus()
# Parenthesised so that the line is valid in Python 3 and prints the same in Python 2
print(result)

Bazar/Synonym

Updated on 11/09/2011