Posted By

mjaniec on 05/15/11


Tagged

LSA


Versions (?)

Testing LSA in R


 / Published in: R
 

URL: http://mjaniec.blogspot.com/2011/05/testing-lsa-in-r.html

  1. library(lsa)
  2.  
  3. # create some files
  4. td = tempfile()
  5. dir.create(td)
  6. write( c("dog", "cat", "mouse"), file=paste(td, "D1", sep="/"))
  7. write( c("hamster", "mouse", "sushi"), file=paste(td, "D2", sep="/"))
  8. write( c("dog", "monster", "monster"), file=paste(td, "D3", sep="/"))
  9. write( c("dog", "mouse", "dog"), file=paste(td, "D4", sep="/"))
  10.  
  11. # read files into a document-term matrix
  12. myMatrix = textmatrix(td, minWordLength=1) # textvector dla jednego pliku
  13.  
  14. myMatrix = lw_bintf(myMatrix) * gw_idf(myMatrix)
  15.  
  16. summary(myMatrix)
  17.  
  18. # create the latent semantic space
  19. myLSAspace = lsa(myMatrix, dims=dimcalc_raw())
  20.  
  21. # display it as a textmatrix again
  22. round(as.textmatrix(myLSAspace),2) # should give the original
  23.  
  24. # create the latent semantic space
  25. myLSAspace = lsa(myMatrix, dims=dimcalc_share())
  26.  
  27. # display it as a textmatrix again
  28. myNewMatrix = as.textmatrix(myLSAspace)
  29. myNewMatrix # should look be different!
  30.  
  31. # compare two terms with the cosine measure
  32. cosine(myNewMatrix["dog",], myNewMatrix["cat",])
  33.  
  34. # calc associations for mouse
  35. associate(myNewMatrix, "mouse")
  36.  
  37. # demonstrate generation of a query
  38. query("monster", rownames(myNewMatrix))
  39. query("monster dog", rownames(myNewMatrix))
  40.  
  41. # compare two documents with pearson
  42. cor(myNewMatrix[,1], myNewMatrix[,2], method="pearson")
  43.  
  44. # clean up
  45. unlink(td, recursive=TRUE)
  46.  
  47. # [---]
  48.  
  49. # LSA search
  50. q <- fold_in(query("sushi hamster", rownames(myNewMatrix)),myLSAspace) # query <> LSAspace
  51.  
  52. qd <- 0
  53. for (i in 1:ncol(myNewMatrix)) {
  54.  
  55. qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i]))
  56.  
  57. }
  58.  
  59. #---
  60.  
  61. # sk - stala na potrzeby Query
  62.  
  63. sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk))
  64. for (i in 1:length(myLSAspace$sk)) {
  65. sk[i,i] <- myLSAspace$sk[i]
  66. }
  67.  
  68. QueryVector <- function(p) {
  69.  
  70. q <- query(p, rownames(myNewMatrix))
  71.  
  72. t(q) %*% myLSAspace$tk %*% solve(sk)
  73.  
  74. }
  75.  
  76. SearchPhraseOld <- function(p) { # zwraca miary dla wszystkich dokumentow
  77.  
  78. q <- fold_in(query(p, rownames(myNewMatrix)),myLSAspace)
  79.  
  80. qd <- 0
  81. for (i in 1:ncol(myNewMatrix)) {
  82.  
  83. qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i]))
  84.  
  85. }
  86.  
  87. qd
  88.  
  89. }
  90.  
  91. SearchPhrase <- function(p) {
  92.  
  93. q <- QueryVector(p)
  94.  
  95. qd <- 0
  96. for (i in 1:nrow(myLSAspace$dk)) {
  97.  
  98. qd[i] <- cosine(as.vector(q),as.vector(myLSAspace$dk[i,]))
  99.  
  100. }
  101.  
  102. qd
  103.  
  104.  
  105. }
  106.  
  107. # porownanie correlation i cosine distance dla search:
  108.  
  109. cor(q,myNewMatrix,method="spearman")
  110.  
  111. qd
  112.  
  113. # compare two phrases
  114.  
  115. ComparePhrasesOld <- function(p1, p2) {
  116.  
  117. q1 <- fold_in(query(p1, rownames(myNewMatrix)),myLSAspace)
  118. q2 <- fold_in(query(p2, rownames(myNewMatrix)),myLSAspace)
  119.  
  120. cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor
  121.  
  122. }
  123.  
  124.  
  125. ComparePhrases <- function(p1, p2) {
  126.  
  127. q1 <- QueryVector(p1)
  128. q2 <- QueryVector(p2)
  129.  
  130. cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor
  131.  
  132. }
  133.  
  134. ComparePhrases("cat dog","monster dog")
  135.  
  136. ComparePhrases("sushi hamster","monster dog")
  137.  
  138. #---
  139.  
  140. q <- query("sushi hamster", rownames(myNewMatrix))
  141.  
  142. sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk))
  143. for (i in 1:length(myLSAspace$sk)) {
  144. sk[i,i] <- myLSAspace$sk[i]
  145. }
  146.  
  147. t(q) %*% myLSAspace$tk %*% solve(sk)

Report this snippet  

You need to login to post a comment.