/ Published in: R
                    
                                        
                            
                                Expand |
                                Embed | Plain Text
                            
                        
                        Copy this code and paste it in your HTML
library(lsa)
# create some files
td = tempfile()
dir.create(td)
write( c("dog", "cat", "mouse"), file=paste(td, "D1", sep="/"))
write( c("hamster", "mouse", "sushi"), file=paste(td, "D2", sep="/"))
write( c("dog", "monster", "monster"), file=paste(td, "D3", sep="/"))
write( c("dog", "mouse", "dog"), file=paste(td, "D4", sep="/"))
# read files into a document-term matrix
myMatrix = textmatrix(td, minWordLength=1) # textvector dla jednego pliku
myMatrix = lw_bintf(myMatrix) * gw_idf(myMatrix)
summary(myMatrix)
# create the latent semantic space
myLSAspace = lsa(myMatrix, dims=dimcalc_raw())
# display it as a textmatrix again
round(as.textmatrix(myLSAspace),2) # should give the original
# create the latent semantic space
myLSAspace = lsa(myMatrix, dims=dimcalc_share())
# display it as a textmatrix again
myNewMatrix = as.textmatrix(myLSAspace)
myNewMatrix # should look be different!
# compare two terms with the cosine measure
cosine(myNewMatrix["dog",], myNewMatrix["cat",])
# calc associations for mouse
associate(myNewMatrix, "mouse")
# demonstrate generation of a query
query("monster", rownames(myNewMatrix))
query("monster dog", rownames(myNewMatrix))
# compare two documents with pearson
cor(myNewMatrix[,1], myNewMatrix[,2], method="pearson")
# clean up
unlink(td, recursive=TRUE)
# [---]
# LSA search
q <- fold_in(query("sushi hamster", rownames(myNewMatrix)),myLSAspace) # query <> LSAspace
qd <- 0
for (i in 1:ncol(myNewMatrix)) {
qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i]))
}
#---
# sk - stala na potrzeby Query
sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk))
for (i in 1:length(myLSAspace$sk)) {
sk[i,i] <- myLSAspace$sk[i]
}
QueryVector <- function(p) {
q <- query(p, rownames(myNewMatrix))
t(q) %*% myLSAspace$tk %*% solve(sk)
}
SearchPhraseOld <- function(p) { # zwraca miary dla wszystkich dokumentow
q <- fold_in(query(p, rownames(myNewMatrix)),myLSAspace)
qd <- 0
for (i in 1:ncol(myNewMatrix)) {
qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i]))
}
qd
}
SearchPhrase <- function(p) {
q <- QueryVector(p)
qd <- 0
for (i in 1:nrow(myLSAspace$dk)) {
qd[i] <- cosine(as.vector(q),as.vector(myLSAspace$dk[i,]))
}
qd
}
# porownanie correlation i cosine distance dla search:
cor(q,myNewMatrix,method="spearman")
qd
# compare two phrases
ComparePhrasesOld <- function(p1, p2) {
q1 <- fold_in(query(p1, rownames(myNewMatrix)),myLSAspace)
q2 <- fold_in(query(p2, rownames(myNewMatrix)),myLSAspace)
cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor
}
ComparePhrases <- function(p1, p2) {
q1 <- QueryVector(p1)
q2 <- QueryVector(p2)
cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor
}
ComparePhrases("cat dog","monster dog")
ComparePhrases("sushi hamster","monster dog")
#---
q <- query("sushi hamster", rownames(myNewMatrix))
sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk))
for (i in 1:length(myLSAspace$sk)) {
sk[i,i] <- myLSAspace$sk[i]
}
t(q) %*% myLSAspace$tk %*% solve(sk)
URL: http://mjaniec.blogspot.com/2011/05/testing-lsa-in-r.html
Comments
 Subscribe to comments
                    Subscribe to comments
                
                