Finds synonyms of a word.
Replaces a randomly picked word of a sentence with a synonym of the same grammatical category (noun, adjective, verb or adverb).
#!/usr/bin/python
# -*- coding: utf-8 -*-
# (c) Stéphanie Vilayphiou
# License: GNU-GPL 3
#
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or any later version.
#
# Please don't forget to mention the author's name along your new
# project as specified in the license.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from nltk import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords, brown
import random
####################################################
# TO DO
#
# + better automatic tagging.
# + keep plurals and verb forms.
####################################################
# input sentence, split on spaces into a list of words
pangram = "The quick brown fox runs over the lazy dog".split(' ')

# select English stopwords
# (from here on, the name `stopwords` refers to the list of words, not to the imported corpus)
stopwords = stopwords.words('english')

####################################################
# Automatic tagging of a sentence, based on the Brown News corpus.
# This section is recompiled from the Natural Language Processing Book:
# http://nltk.googlecode.com/svn/trunk/doc/book/book.html
brown_news_tagged = brown.tagged_sents(categories='news')

# the first 90% of the tagged sentences are used to train the taggers
size = int(len(brown_news_tagged) * 0.9)
brown_news_train = brown_news_tagged[:size]

# standalone unigram tagger; it is not part of the backoff chain below
unigram_tagger = UnigramTagger(brown_news_train)

# Backoff chain: the BigramTagger is tried first -- if it fails, the
# UnigramTagger is used -- if it fails too, the DefaultTagger tags the word 'NN'
t0 = DefaultTagger('NN')
t1 = UnigramTagger(brown_news_train, backoff=t0)
t2 = BigramTagger(brown_news_train, backoff=t1)

# Apply the automatic tagging to our list of words
tags = t2.tag(pangram)
print(tags)
####################################################
def pick_word():
    """
    Picks a random word of the tagged list of words which is not a stopword.

    Reads the module-level list `tags` of (word, tag) pairs and the
    module-level list `stopwords`. Words are lowercased before they are
    compared with the stopwords. The picked word is printed.

    Returns a tuple (word, tag, index), where index is the position of the
    word in `tags`.

    Raises ValueError when `tags` contains no word which is not a stopword.
    """
    # Collect every eligible position first, then draw once among them.
    # Drawing at random and retrying until a non-stopword comes up never
    # ends when all the words are stopwords.
    candidates = [
        index for index in range(len(tags))
        if tags[index][0].lower() not in stopwords
    ]
    if not candidates:
        raise ValueError("no word to pick: every word is a stopword")

    random_index = random.choice(candidates)
    target_word = tags[random_index]
    word = target_word[0]
    tag = target_word[1]
    print(word)
    return (word, tag, random_index)
def _wordnet_pos(tag):
    """
    Translates a tag given by the NLTK tagger into a Wordnet category.

    Returns 'n' (noun), 'a' (adjective), 'v' (verb) or 'r' (adverb), or None
    when the tag belongs to none of the four lists below.
    """
    # NOTE(review): these look like Penn Treebank tags, while the tagger is
    # trained on the Brown corpus -- confirm that the lists cover its tags.
    nouns = ('NN', 'NNS', 'NNP', 'NNPS', 'FW')
    adjectives = ('JJ', 'JJS', 'JJR')
    verbs = ('MD', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    adverbs = ('RB', 'RBR', 'RBS', 'RP', 'WRB')

    if tag in nouns:
        return 'n'
    if tag in adjectives:
        return 'a'
    if tag in verbs:
        return 'v'
    if tag in adverbs:
        return 'r'
    return None


def thesaurus():
    """
    Replaces a random word of the sentence by one of its Wordnet synonyms of
    the same grammatical lexical category.

    The word is chosen by pick_word(). The module-level list `pangram` is
    modified in place: the picked word is replaced by a random synonym, or
    left as it is when Wordnet gives no synonym. The list of synonyms found
    is printed.

    Returns the new sentence as a string: the words joined with spaces,
    underscores replaced by spaces, and a final full stop.
    """
    picked_word = pick_word()
    word = picked_word[0]
    word_tag = picked_word[1]
    word_index = picked_word[2]

    # A tag which cannot be translated is not a Wordnet category; None makes
    # Wordnet look into all the categories instead.
    wordnet_pos = _wordnet_pos(word_tag)

    # lists all synonyms of the target word, respecting the lexical category.
    synonyms = []
    for synset in wn.synsets(word, pos=wordnet_pos):
        lemma_names = synset.lemma_names
        # lemma_names is an attribute in old NLTK versions, a method in NLTK 3
        if callable(lemma_names):
            lemma_names = lemma_names()
        for lemma in lemma_names:
            # the word itself is not a synonym, whatever its capitalisation
            if lemma.lower() != word.lower():
                synonyms.append(lemma)
    print(synonyms)

    # picks a random synonym from the synonyms list;
    # if the synonyms list is empty, keeps the picked word
    if synonyms:
        new_word = random.choice(synonyms)
    else:
        new_word = word

    # replaces the target word by the found word in the list
    pangram[word_index] = new_word

    # makes a string out of the list
    # (multi-word Wordnet lemmas are written with underscores)
    return ' '.join(pangram).replace('_', ' ') + '.'
# runs the replacement once and shows the new sentence
result = thesaurus()
print(result)
Bazar/Synonym
Updated on 11/09/2011