TextRank - keywords extraction


/ Published in: R
Save to your folder(s)



Copy this code and paste it in your HTML
  1. # http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
  2. # http://en.wikipedia.org/wiki/PageRank#Computation
  3.  
  4. # --- LIBRARIES
  5.  
  6. library(gregmisc)
  7. library(tm)
  8. library(openNLP)
  9. library(graph)
  10.  
  11. # --- FUNCTIONS
  12.  
  13. SplitText <- function(Phrase) { # podziel tekst na slowa rozdzielone spacja (zakladam wczesniejsze pozbycie sie punctuation)
  14.  
  15. unlist(strsplit(Phrase," "))
  16.  
  17. }
  18.  
  19. IsPunctuated <- function(Phrase) {
  20.  
  21. length(grep("\\.|,|!|\\?|;|:|\\)|]|}\\Z",Phrase,perl=TRUE))>0 # punctuation: . , ! ? ; : ) ] }
  22.  
  23. }
  24.  
  25. SelectTaggedWords <- function(Words,tagID) { # wybierz otagowane slowa
  26.  
  27. Words[ grep(tagID,Words) ]
  28.  
  29. }
  30.  
  31. RemoveTags <- function(Words) {
  32.  
  33. sub("/[A-Z]{2,3}","",Words)
  34.  
  35. }
  36.  
  37. IsSelectedWord <- function(Word) { # czy dane slowo jest znaczace?; korzysta z selected_words
  38.  
  39. ifelse(length(which(selected_words == Word))>0, TRUE, FALSE)
  40.  
  41. }
  42.  
  43.  
  44. GetWordLinks <- function(position,scope) { # znaczace slowa w odleglosci scope od slowa na pozycji position; korzysta z selected_words i words
  45.  
  46. scope <- ifelse(position+scope>length(words),length(words),position+scope)
  47.  
  48. links <- ""
  49. for (i in (position+1):scope) {
  50. if ( IsSelectedWord(words[i]) ) links <- c(links,words[i])
  51. }
  52.  
  53. if (length(links)>1) {
  54. links[2:length(links)]
  55. }
  56. else {
  57. links <- ""
  58. }
  59.  
  60. }
  61.  
  62. ConstructTextGraph <- function(n) { # budowa grafu; n - odleglosc dla co-ocurence; korzysta z words
  63.  
  64. word_graph <- new("graphNEL")
  65.  
  66. i <- 1
  67. while (i < length(words) ) {
  68.  
  69. if ( IsSelectedWord(words[i]) ) { # analizujemy tylko wybrane slowa (rzeczowniki i przymiotniki)
  70.  
  71. links <- GetWordLinks(i,n) # sprawdzamy slowa w odleglosci n
  72.  
  73. if (links[1] != "") { # istnieja powiazane slowa?
  74.  
  75. cat(i," ",words[i]," - ",paste(c(links),collapse=" "),"\n")
  76.  
  77. if ( length(which(nodes(word_graph)==words[i]))==0 ) { # potrzebny nowy wezel dla slowa bazowego?
  78.  
  79. word_graph <- addNode(words[i],word_graph)
  80.  
  81. }
  82.  
  83. for (j in 1:length(links)) {
  84.  
  85. if ( length(which(nodes(word_graph)==links[j]))==0 ) { # potrzebny nowy wezel dla slowa powiazanego?
  86.  
  87. word_graph <- addNode(links[j],word_graph)
  88. word_graph <- addEdge(words[i],links[j],word_graph,1)
  89.  
  90. }
  91. else {
  92.  
  93. if ( length(which(edges(word_graph,links[j])[[1]]==words[i]))>0 ) { # czy istnieje juz krawedz miedzy wezlami?
  94. # szczegolny przypadek: krawedz do samego siebie
  95.  
  96. prev_edge_weight <- as.numeric(edgeData(word_graph,words[i],links[j],"weight"))
  97.  
  98. edgeData(word_graph,words[i],links[j],"weight") <- prev_edge_weight+1
  99.  
  100. }
  101. else {
  102.  
  103. word_graph <- addEdge(words[i],links[j],word_graph,1)
  104.  
  105. }
  106.  
  107. }
  108.  
  109. }
  110.  
  111. }
  112.  
  113. }
  114.  
  115. i <- i+1
  116.  
  117. }
  118.  
  119. word_graph
  120.  
  121. }
  122.  
  123. # --- MAIN CODE
  124.  
  125. # --- TEXT PREPARATION
  126.  
  127. doc <- c("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types.")
  128.  
  129. corp <- Corpus(VectorSource(doc))
  130.  
  131. corp <- tm_map(corp, stripWhitespace)
  132.  
  133. corp <- tm_map(corp, tolower)
  134.  
  135. words_with_punctuation <- SplitText(as.character(corp[[1]]))
  136.  
  137. corp <- tm_map(corp, removePunctuation)
  138.  
  139. #--- GRAPH CONSTRUCTION
  140.  
  141. words <- SplitText(as.character(corp[[1]]))
  142.  
  143. tagged_text <- tagPOS(corp[[1]])
  144.  
  145. tagged_words <- SplitText(tagged_text)
  146. tagged_words <- c(SelectTaggedWords(tagged_words,"/NN"),SelectTaggedWords(tagged_words,"/JJ")) # wybieramy rzeczowniki i przymiotniki
  147. tagged_words <- RemoveTags(tagged_words) # usuwamy tagi POS
  148.  
  149. selected_words <- unique(tagged_words) # redukujemy wektor znaczacych slow
  150.  
  151. text_graph <- ConstructTextGraph(2) # 2 - odleglosc dla co-ocurence
  152.  
  153. # --- GLOWNA PETLA PAGE RANK
  154. # na razie nie uwzgledniamy wag krawedzi
  155.  
  156. d <- 0.85 # stala z PageRank
  157.  
  158. threshold <- 1e-6 # poziom konwergencji
  159.  
  160. text_nodes <- nodes(text_graph)
  161.  
  162. nodes_num <- length(text_nodes)
  163.  
  164. nodes_rank <- matrix(1,nodes_num,2) # 1 kol - nowe wartosci, 2 kol - stare wartosci
  165.  
  166. k <- 0 # kontrolnie - liczba iteracji
  167. convergence_reached <- FALSE
  168. repeat {
  169.  
  170. for (i in 1:nodes_num) {
  171.  
  172. incoming_link <- adj(text_graph,text_nodes[i])[[1]]
  173.  
  174. incoming_num <- length(incoming_link)
  175.  
  176. tmp <- 0
  177. for (j in 1:incoming_num) {
  178.  
  179. link_num <- which(text_nodes==incoming_link[j])
  180.  
  181. outgoing_num <- length(adj(text_graph,text_nodes[link_num])[[1]])
  182.  
  183. tmp <- tmp + nodes_rank[link_num,1] / outgoing_num
  184.  
  185. }
  186.  
  187. nodes_rank[i,1] <- (1-d)+d*tmp
  188.  
  189. }
  190.  
  191. k <- k+1
  192.  
  193. for (i in 1:nodes_num) {
  194.  
  195. if (abs(nodes_rank[i,1]-nodes_rank[i,2])<threshold) convergence_reached <- TRUE
  196.  
  197. }
  198.  
  199. if (convergence_reached) break
  200.  
  201. nodes_rank[,2] <- nodes_rank[,1]
  202.  
  203. }
  204.  
  205. # --- POST-PROCESSING
  206.  
  207. keywords_num <- round(nodes_num/3)
  208.  
  209. ranked_words <- data.frame(text_nodes,nodes_rank[,1]) # data frame zawierajaca slowa i ich ranking
  210. names(ranked_words) <- c("word","rank")
  211.  
  212. strong_words <- ranked_words[order(ranked_words$rank,decreasing=TRUE),] # sortowanie wg rankingu
  213.  
  214. strong_words <- as.character(strong_words$word[1:keywords_num]) # redukcja do wektora slow
  215.  
  216. keywords <- ""
  217. keywords_scores <- 0
  218.  
  219. for (i in 1:keywords_num) {
  220.  
  221. keyword_positions <- which(words==strong_words[i])
  222.  
  223. for (j in 1:length(keyword_positions)) {
  224.  
  225. keyword <- ""
  226. keyword_score <- 0
  227.  
  228. k <- keyword_positions[j] # sprawdzamy znaczace slowa PO mocnym slowie
  229. repeat {
  230.  
  231. if (IsSelectedWord(words[k])) {
  232. keyword <- trim(paste(c(keyword,words[k]),collapse=" "))
  233. keyword_score <- keyword_score + ranked_words[which(ranked_words$word==words[k]),2]
  234. }
  235. else break # koniec znaczacych slow
  236.  
  237. if (IsPunctuated(words_with_punctuation[k])) break # koniec sekcji
  238.  
  239. if (k==length(words)) break # koniec tekstu
  240.  
  241. k <- k+1
  242. }
  243.  
  244. k <- keyword_positions[j]-1 # sprawdzamy znaczace slowa PRZED mocnym slowem
  245.  
  246. repeat {
  247.  
  248. if (k<1) break # poczatek tekstu
  249.  
  250. if (IsSelectedWord(words[k])) {
  251. keyword <- paste(c(words[k],trim(keyword)),collapse=" ")
  252. keyword_score <- keyword_score + ranked_words[which(ranked_words$word==words[k]),2]
  253. }
  254. else break # koniec znaczacych slow
  255.  
  256. if (k>1) {
  257. if (IsPunctuated(words_with_punctuation[k-1])) break # koniec sekcji slowo wczesniej
  258. }
  259.  
  260. k <- k-1
  261.  
  262. }
  263.  
  264. if (keyword!=strong_words[i]) {
  265. keywords <- c(keywords,keyword)
  266. keywords_scores <- c(keywords_scores,keyword_score)
  267. }
  268.  
  269. }
  270.  
  271. }
  272.  
  273. keywords_df <- data.frame(keywords,keywords_scores)
  274.  
  275. keywords_list <- keywords_df[order(keywords_df$keywords_scores,decreasing=TRUE),] # sortowanie wg rankingu
  276.  
  277. keywords_list <- unique(as.character(keywords_list$keywords[1:nrow(keywords_list)])) # redukcja do wektora slow
  278.  
  279. sort(keywords_list)

URL: http://mjaniec.blogspot.com/2011/05/textrank-implementation-in-r.html

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.