# Published in: R
# Expand | Embed | Plain Text
# Copy this code and paste it in your HTML
# TextRank keyword extraction (Mihalcea & Tarau, EMNLP 2004).
# http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
# http://en.wikipedia.org/wiki/PageRank#Computation

# --- LIBRARIES
library(gregmisc)  # NOTE(review): deprecated; only trim() was used, now replaced by base trimws()
library(tm)
library(openNLP)
library(graph)

# --- FUNCTIONS

# Split a phrase into words on single spaces
# (assumes punctuation was removed / normalized beforehand).
SplitText <- function(Phrase) {
  unlist(strsplit(Phrase, " "))
}

# TRUE when the token carries section-ending punctuation:
# . , ! ? ; : ) ] }
IsPunctuated <- function(Phrase) {
  length(grep("\\.|,|!|\\?|;|:|\\)|]|}\\Z", Phrase, perl = TRUE)) > 0
}

# Keep only tokens carrying the given POS tag suffix (e.g. "/NN").
SelectTaggedWords <- function(Words, tagID) {
  Words[grep(tagID, Words)]
}

# Strip the "/TAG" POS suffix from tagged tokens.
RemoveTags <- function(Words) {
  sub("/[A-Z]{2,3}", "", Words)
}

# Is the word one of the selected (noun/adjective) words?
# Reads the global 'selected_words'.
# FIX: %in% replaces the roundabout ifelse(length(which(...)) > 0, ...)
# and is naturally vectorized.
IsSelectedWord <- function(Word) {
  Word %in% selected_words
}

# Selected words within 'scope' positions AFTER position 'position'.
# Reads the globals 'selected_words' and 'words'.
# Returns character(0) when there are none.
# FIX: no "" sentinel element; the caller tests length() instead.
GetWordLinks <- function(position, scope) {
  upper <- min(position + scope, length(words))
  # position < length(words) is guaranteed by the caller's while-loop,
  # so position + 1 <= upper and the sequence is never descending
  candidates <- words[(position + 1):upper]
  candidates[IsSelectedWord(candidates)]
}

# Build the co-occurrence graph: one node per selected word, an edge
# whenever two selected words occur within 'n' words of each other;
# the edge "weight" attribute counts repeated co-occurrences.
# Reads the global 'words'.
ConstructTextGraph <- function(n) {
  word_graph <- new("graphNEL")
  i <- 1
  while (i < length(words)) {
    if (IsSelectedWord(words[i])) {    # only nouns and adjectives are analyzed
      links <- GetWordLinks(i, n)      # co-occurring selected words within distance n
      if (length(links) > 0) {         # any linked words?
        cat(i," ",words[i]," - ",paste(c(links),collapse=" "),"\n")
        if (!(words[i] %in% nodes(word_graph))) {   # new node needed for the base word?
          word_graph <- addNode(words[i], word_graph)
        }
        for (j in seq_along(links)) {
          if (!(links[j] %in% nodes(word_graph))) { # new node needed for the linked word?
            word_graph <- addNode(links[j], word_graph)
            word_graph <- addEdge(words[i], links[j], word_graph, 1)
          } else {
            # node exists: bump the weight if the edge already exists,
            # otherwise add a fresh edge (also covers self-loops)
            if (words[i] %in% edges(word_graph, links[j])[[1]]) {
              prev_edge_weight <- as.numeric(edgeData(word_graph, words[i], links[j], "weight"))
              edgeData(word_graph, words[i], links[j], "weight") <- prev_edge_weight + 1
            } else {
              word_graph <- addEdge(words[i], links[j], word_graph, 1)
            }
          }
        }
      }
    }
    i <- i + 1
  }
  word_graph
}

# --- MAIN CODE

# --- TEXT PREPARATION
doc <- c("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types.")
corp <- Corpus(VectorSource(doc))
corp <- tm_map(corp, stripWhitespace)
# NOTE(review): modern tm requires tm_map(corp, content_transformer(tolower));
# the direct form below matches the tm API this script was written against
corp <- tm_map(corp, tolower)
words_with_punctuation <- SplitText(as.character(corp[[1]]))
corp <- tm_map(corp, removePunctuation)

# --- GRAPH CONSTRUCTION
words <- SplitText(as.character(corp[[1]]))
# NOTE(review): tagPOS() is the old openNLP API; newer openNLP versions
# use Maxent_POS_Tag_Annotator instead — confirm against installed version
tagged_text <- tagPOS(corp[[1]])
tagged_words <- SplitText(tagged_text)
# keep nouns and adjectives only
tagged_words <- c(SelectTaggedWords(tagged_words,"/NN"), SelectTaggedWords(tagged_words,"/JJ"))
tagged_words <- RemoveTags(tagged_words)        # drop the POS tags
selected_words <- unique(tagged_words)          # reduce to the set of significant words
text_graph <- ConstructTextGraph(2)             # 2 = co-occurrence window

# --- MAIN PAGERANK LOOP
# edge weights are not taken into account yet
d <- 0.85          # PageRank damping factor
threshold <- 1e-6  # convergence threshold
text_nodes <- nodes(text_graph)
nodes_num <- length(text_nodes)
# column 1 = new values, column 2 = values from the previous iteration
nodes_rank <- matrix(1, nodes_num, 2)
k <- 0             # iteration counter (diagnostic)
repeat {
  for (i in seq_len(nodes_num)) {
    incoming_link <- adj(text_graph, text_nodes[i])[[1]]
    tmp <- 0
    # FIX: seq_along() guards against nodes with no incoming links
    # (the original `1:incoming_num` ran j = 1, 0 when incoming_num == 0)
    for (j in seq_along(incoming_link)) {
      link_num <- which(text_nodes == incoming_link[j])
      outgoing_num <- length(adj(text_graph, text_nodes[link_num])[[1]])
      tmp <- tmp + nodes_rank[link_num, 1] / outgoing_num
    }
    nodes_rank[i, 1] <- (1 - d) + d * tmp
  }
  k <- k + 1
  # FIX: converge only when ALL nodes moved by less than the threshold;
  # the original set a sticky flag as soon as ANY single node converged,
  # stopping the iteration far too early
  if (all(abs(nodes_rank[, 1] - nodes_rank[, 2]) < threshold)) break
  nodes_rank[, 2] <- nodes_rank[, 1]
}

# --- POST-PROCESSING
keywords_num <- round(nodes_num / 3)                       # keep the top third of nodes
ranked_words <- data.frame(text_nodes, nodes_rank[, 1])    # words and their final ranks
names(ranked_words) <- c("word", "rank")
strong_words <- ranked_words[order(ranked_words$rank, decreasing = TRUE), ]  # sort by rank
strong_words <- as.character(strong_words$word[seq_len(keywords_num)])       # reduce to a word vector

# FIX: start from empty vectors instead of ""/0 sentinels, which leaked
# an empty-string pseudo-keyword (and a 0 score) into the final output
keywords <- character(0)
keywords_scores <- numeric(0)
for (i in seq_len(keywords_num)) {
  keyword_positions <- which(words == strong_words[i])
  for (j in seq_along(keyword_positions)) {
    keyword <- ""
    keyword_score <- 0
    k <- keyword_positions[j]
    # grow the phrase with selected words AFTER the strong word
    repeat {
      if (IsSelectedWord(words[k])) {
        keyword <- trimws(paste(c(keyword, words[k]), collapse = " "))
        keyword_score <- keyword_score + ranked_words[which(ranked_words$word == words[k]), 2]
      } else break                                         # run of selected words ended
      if (IsPunctuated(words_with_punctuation[k])) break   # end of section
      if (k == length(words)) break                        # end of text
      k <- k + 1
    }
    k <- keyword_positions[j] - 1
    # grow the phrase with selected words BEFORE the strong word
    repeat {
      if (k < 1) break                                     # start of text
      if (IsSelectedWord(words[k])) {
        keyword <- paste(c(words[k], trimws(keyword)), collapse = " ")
        keyword_score <- keyword_score + ranked_words[which(ranked_words$word == words[k]), 2]
      } else break                                         # run of selected words ended
      if (k > 1) {
        if (IsPunctuated(words_with_punctuation[k - 1])) break  # section ends one word earlier
      }
      k <- k - 1
    }
    # keep only phrases that extend beyond the strong word itself
    if (keyword != strong_words[i]) {
      keywords <- c(keywords, keyword)
      keywords_scores <- c(keywords_scores, keyword_score)
    }
  }
}
keywords_df <- data.frame(keywords, keywords_scores)
keywords_list <- keywords_df[order(keywords_df$keywords_scores, decreasing = TRUE), ]  # sort by score
keywords_list <- unique(as.character(keywords_list$keywords))  # reduce to a unique word vector
sort(keywords_list)
# Source: http://mjaniec.blogspot.com/2011/05/textrank-implementation-in-r.html