Revision: 46269
Updated Code
at May 18, 2011 01:43 by mjaniec
Updated Code
library(lsa) # create some files td = tempfile() dir.create(td) write( c("dog", "cat", "mouse"), file=paste(td, "D1", sep="/")) write( c("hamster", "mouse", "sushi"), file=paste(td, "D2", sep="/")) write( c("dog", "monster", "monster"), file=paste(td, "D3", sep="/")) write( c("dog", "mouse", "dog"), file=paste(td, "D4", sep="/")) # read files into a document-term matrix myMatrix = textmatrix(td, minWordLength=1) # textvector dla jednego pliku myMatrix = lw_bintf(myMatrix) * gw_idf(myMatrix) summary(myMatrix) # create the latent semantic space myLSAspace = lsa(myMatrix, dims=dimcalc_raw()) # display it as a textmatrix again round(as.textmatrix(myLSAspace),2) # should give the original # create the latent semantic space myLSAspace = lsa(myMatrix, dims=dimcalc_share()) # display it as a textmatrix again myNewMatrix = as.textmatrix(myLSAspace) myNewMatrix # should look be different! # compare two terms with the cosine measure cosine(myNewMatrix["dog",], myNewMatrix["cat",]) # calc associations for mouse associate(myNewMatrix, "mouse") # demonstrate generation of a query query("monster", rownames(myNewMatrix)) query("monster dog", rownames(myNewMatrix)) # compare two documents with pearson cor(myNewMatrix[,1], myNewMatrix[,2], method="pearson") # clean up unlink(td, recursive=TRUE) # [---] # LSA search q <- fold_in(query("sushi hamster", rownames(myNewMatrix)),myLSAspace) # query <> LSAspace qd <- 0 for (i in 1:ncol(myNewMatrix)) { qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i])) } #--- # sk - stala na potrzeby Query sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk)) for (i in 1:length(myLSAspace$sk)) { sk[i,i] <- myLSAspace$sk[i] } QueryVector <- function(p) { q <- query(p, rownames(myNewMatrix)) t(q) %*% myLSAspace$tk %*% solve(sk) } SearchPhraseOld <- function(p) { # zwraca miary dla wszystkich dokumentow q <- fold_in(query(p, rownames(myNewMatrix)),myLSAspace) qd <- 0 for (i in 1:ncol(myNewMatrix)) { qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i])) } qd } SearchPhrase <- function(p) { q <- QueryVector(p) qd <- 0 for (i in 1:nrow(myLSAspace$dk)) { qd[i] <- cosine(as.vector(q),as.vector(myLSAspace$dk[i,])) } qd } # porownanie correlation i cosine distance dla search: cor(q,myNewMatrix,method="spearman") qd # compare two phrases ComparePhrasesOld <- function(p1, p2) { q1 <- fold_in(query(p1, rownames(myNewMatrix)),myLSAspace) q2 <- fold_in(query(p2, rownames(myNewMatrix)),myLSAspace) cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor } ComparePhrases <- function(p1, p2) { q1 <- QueryVector(p1) q2 <- QueryVector(p2) cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor } ComparePhrases("cat dog","monster dog") ComparePhrases("sushi hamster","monster dog") #--- q <- query("sushi hamster", rownames(myNewMatrix)) sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk)) for (i in 1:length(myLSAspace$sk)) { sk[i,i] <- myLSAspace$sk[i] } t(q) %*% myLSAspace$tk %*% solve(sk)
Revision: 46268
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at May 15, 2011 23:13 by mjaniec
Initial Code
library(lsa) # create some files td = tempfile() dir.create(td) write( c("dog", "cat", "mouse"), file=paste(td, "D1", sep="/")) write( c("hamster", "mouse", "sushi"), file=paste(td, "D2", sep="/")) write( c("dog", "monster", "monster"), file=paste(td, "D3", sep="/")) write( c("dog", "mouse", "dog"), file=paste(td, "D4", sep="/")) # read files into a document-term matrix myMatrix = textmatrix(td, minWordLength=1) # textvector dla jednego pliku myMatrix = lw_bintf(myMatrix) * gw_idf(myMatrix) summary(myMatrix) # create the latent semantic space myLSAspace = lsa(myMatrix, dims=dimcalc_raw()) # display it as a textmatrix again round(as.textmatrix(myLSAspace),2) # should give the original # create the latent semantic space myLSAspace = lsa(myMatrix, dims=dimcalc_share()) # display it as a textmatrix again myNewMatrix = as.textmatrix(myLSAspace) myNewMatrix # should look be different! # compare two terms with the cosine measure cosine(myNewMatrix["dog",], myNewMatrix["cat",]) # calc associations for mouse associate(myNewMatrix, "mouse") # demonstrate generation of a query query("monster", rownames(myNewMatrix)) query("monster dog", rownames(myNewMatrix)) # compare two documents with pearson cor(myNewMatrix[,1], myNewMatrix[,2], method="pearson") # clean up unlink(td, recursive=TRUE) # [---] # LSA search q <- fold_in(query("sushi hamster", rownames(myNewMatrix)),myLSAspace) # query <> LSAspace qd <- 0 for (i in 1:ncol(myNewMatrix)) { qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i])) } # porownanie correlation i cosine distance dla search: cor(q,myNewMatrix,method="spearman") qd # compare two phrases ComparePhrases <- function(p1, p2) { q1 <- fold_in(query(p1, rownames(myNewMatrix)),myLSAspace) q2 <- fold_in(query(p2, rownames(myNewMatrix)),myLSAspace) cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor } ComparePhrases("cat dog","monster dog") ComparePhrases("sushi hamster","monster dog")
Initial URL
http://mjaniec.blogspot.com/2011/05/testing-lsa-in-r.html
Initial Description
Initial Title
Testing LSA in R
Initial Tags
Initial Language
R