Revision: 45936
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at May 11, 2011 09:14 by mjaniec
Initial Code
# http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
# http://en.wikipedia.org/wiki/PageRank#Computation
# --- LIBRARIES
library(gregmisc)
library(tm)
library(openNLP)
library(graph)
# --- FUNCTIONS
SplitText <- function(Phrase) { # podziel tekst na slowa rozdzielone spacja (zakladam wczesniejsze pozbycie sie punctuation)
unlist(strsplit(Phrase," "))
}
IsPunctuated <- function(Phrase) {
length(grep("\\.|,|!|\\?|;|:|\\)|]|}\\Z",Phrase,perl=TRUE))>0 # punctuation: . , ! ? ; : ) ] }
}
SelectTaggedWords <- function(Words,tagID) { # wybierz otagowane slowa
Words[ grep(tagID,Words) ]
}
RemoveTags <- function(Words) {
sub("/[A-Z]{2,3}","",Words)
}
IsSelectedWord <- function(Word) { # czy dane slowo jest znaczace?; korzysta z selected_words
ifelse(length(which(selected_words == Word))>0, TRUE, FALSE)
}
GetWordLinks <- function(position,scope) { # znaczace slowa w odleglosci scope od slowa na pozycji position; korzysta z selected_words i words
scope <- ifelse(position+scope>length(words),length(words),position+scope)
links <- ""
for (i in (position+1):scope) {
if ( IsSelectedWord(words[i]) ) links <- c(links,words[i])
}
if (length(links)>1) {
links[2:length(links)]
}
else {
links <- ""
}
}
ConstructTextGraph <- function(n) { # budowa grafu; n - odleglosc dla co-ocurence; korzysta z words
word_graph <- new("graphNEL")
i <- 1
while (i < length(words) ) {
if ( IsSelectedWord(words[i]) ) { # analizujemy tylko wybrane slowa (rzeczowniki i przymiotniki)
links <- GetWordLinks(i,n) # sprawdzamy slowa w odleglosci n
if (links[1] != "") { # istnieja powiazane slowa?
cat(i," ",words[i]," - ",paste(c(links),collapse=" "),"\n")
if ( length(which(nodes(word_graph)==words[i]))==0 ) { # potrzebny nowy wezel dla slowa bazowego?
word_graph <- addNode(words[i],word_graph)
}
for (j in 1:length(links)) {
if ( length(which(nodes(word_graph)==links[j]))==0 ) { # potrzebny nowy wezel dla slowa powiazanego?
word_graph <- addNode(links[j],word_graph)
word_graph <- addEdge(words[i],links[j],word_graph,1)
}
else {
if ( length(which(edges(word_graph,links[j])[[1]]==words[i]))>0 ) { # czy istnieje juz krawedz miedzy wezlami?
# szczegolny przypadek: krawedz do samego siebie
prev_edge_weight <- as.numeric(edgeData(word_graph,words[i],links[j],"weight"))
edgeData(word_graph,words[i],links[j],"weight") <- prev_edge_weight+1
}
else {
word_graph <- addEdge(words[i],links[j],word_graph,1)
}
}
}
}
}
i <- i+1
}
word_graph
}
# --- MAIN CODE
# --- TEXT PREPARATION
doc <- c("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types.")
corp <- Corpus(VectorSource(doc))
corp <- tm_map(corp, stripWhitespace)
corp <- tm_map(corp, tolower)
words_with_punctuation <- SplitText(as.character(corp[[1]]))
corp <- tm_map(corp, removePunctuation)
#--- GRAPH CONSTRUCTION
words <- SplitText(as.character(corp[[1]]))
tagged_text <- tagPOS(corp[[1]])
tagged_words <- SplitText(tagged_text)
tagged_words <- c(SelectTaggedWords(tagged_words,"/NN"),SelectTaggedWords(tagged_words,"/JJ")) # wybieramy rzeczowniki i przymiotniki
tagged_words <- RemoveTags(tagged_words) # usuwamy tagi POS
selected_words <- unique(tagged_words) # redukujemy wektor znaczacych slow
text_graph <- ConstructTextGraph(2) # 2 - odleglosc dla co-ocurence
# --- GLOWNA PETLA PAGE RANK
# na razie nie uwzgledniamy wag krawedzi
d <- 0.85 # stala z PageRank
threshold <- 1e-6 # poziom konwergencji
text_nodes <- nodes(text_graph)
nodes_num <- length(text_nodes)
nodes_rank <- matrix(1,nodes_num,2) # 1 kol - nowe wartosci, 2 kol - stare wartosci
k <- 0 # kontrolnie - liczba iteracji
convergence_reached <- FALSE
repeat {
for (i in 1:nodes_num) {
incoming_link <- adj(text_graph,text_nodes[i])[[1]]
incoming_num <- length(incoming_link)
tmp <- 0
for (j in 1:incoming_num) {
link_num <- which(text_nodes==incoming_link[j])
outgoing_num <- length(adj(text_graph,text_nodes[link_num])[[1]])
tmp <- tmp + nodes_rank[link_num,1] / outgoing_num
}
nodes_rank[i,1] <- (1-d)+d*tmp
}
k <- k+1
for (i in 1:nodes_num) {
if (abs(nodes_rank[i,1]-nodes_rank[i,2])<threshold) convergence_reached <- TRUE
}
if (convergence_reached) break
nodes_rank[,2] <- nodes_rank[,1]
}
# --- POST-PROCESSING
keywords_num <- round(nodes_num/3)
ranked_words <- data.frame(text_nodes,nodes_rank[,1]) # data frame zawierajaca slowa i ich ranking
names(ranked_words) <- c("word","rank")
strong_words <- ranked_words[order(ranked_words$rank,decreasing=TRUE),] # sortowanie wg rankingu
strong_words <- as.character(strong_words$word[1:keywords_num]) # redukcja do wektora slow
keywords <- ""
keywords_scores <- 0
for (i in 1:keywords_num) {
keyword_positions <- which(words==strong_words[i])
for (j in 1:length(keyword_positions)) {
keyword <- ""
keyword_score <- 0
k <- keyword_positions[j] # sprawdzamy znaczace slowa PO mocnym slowie
repeat {
if (IsSelectedWord(words[k])) {
keyword <- trim(paste(c(keyword,words[k]),collapse=" "))
keyword_score <- keyword_score + ranked_words[which(ranked_words$word==words[k]),2]
}
else break # koniec znaczacych slow
if (IsPunctuated(words_with_punctuation[k])) break # koniec sekcji
if (k==length(words)) break # koniec tekstu
k <- k+1
}
k <- keyword_positions[j]-1 # sprawdzamy znaczace slowa PRZED mocnym slowem
repeat {
if (k<1) break # poczatek tekstu
if (IsSelectedWord(words[k])) {
keyword <- paste(c(words[k],trim(keyword)),collapse=" ")
keyword_score <- keyword_score + ranked_words[which(ranked_words$word==words[k]),2]
}
else break # koniec znaczacych slow
if (k>1) {
if (IsPunctuated(words_with_punctuation[k-1])) break # koniec sekcji slowo wczesniej
}
k <- k-1
}
if (keyword!=strong_words[i]) {
keywords <- c(keywords,keyword)
keywords_scores <- c(keywords_scores,keyword_score)
}
}
}
keywords_df <- data.frame(keywords,keywords_scores)
keywords_list <- keywords_df[order(keywords_df$keywords_scores,decreasing=TRUE),] # sortowanie wg rankingu
keywords_list <- unique(as.character(keywords_list$keywords[1:nrow(keywords_list)])) # redukcja do wektora slow
sort(keywords_list)
Initial URL
http://mjaniec.blogspot.com/2011/05/textrank-implementation-in-r.html
Initial Description
Initial Title
TextRank - keywords extraction
Initial Tags
Initial Language
R