newlisp/examples/query

144 lines
4.5 KiB
Plaintext
Executable File

#!/usr/bin/env newlisp
; Retrieves a page with 20 links via Google containing
; the search word, then spawns 20 child processes, each
; retrieving one page from a link. All pages are then
; HTML-cleaned, tokenized and words are counted using
; bayes-train.
; USAGE: query <search-word>
; or rename to query.cgi and run on a web server
; using the following query.html:
; NOTE!!! Google changes the formatting of links (see LINK_REGEX) once
; in a while and then the whole thing breaks!
;<html>
;<head><title>Query</title></head>
;<body><font face="Verdana">
;<p>Parallel processing support on newLISP using 'spawn'</p>
;<form name="query" action="query.cgi" method="POST">
;Input a search word:&nbsp;<input type="text" name="query" size="48">
;&nbsp;<input type="submit" value="Submit" name="submit">
;</form>
;</font></body></html>
(set 'cgi-mode nil)
;(set 'cgi-mode true) ; run as CGI get question from query.html
(when cgi-mode
(module "cgi.lsp")
(print "Content-Type: text/html\n\n"))
;; globals and constants in this namespace
(set 'LINK_REGEX {q=(http://[^>& ]+)&})
(set 'START_TIME 0)
(set 'SITES '())
(set 'CONTENT "")
(constant 'BRK (if cgi-mode "<br>" ""))
(define (url-decode str)
(replace "%([0-9A-F][0-9A-F])" str (char (int $1 0 16)) 1))
(define (url-encode str)
(replace {([^a-zA-Z0-9])} str (format "%%%2X" (char $1)) 0))
(define (clean-html text)
(replace "<script.*?</script>" text "" 4)
(replace "<style.*?</style>" text "" 4)
(replace "<head.*?</head>" text "" 4)
(replace "<[^>]*>" text " " 0)
(replace "." text " . ")
(replace "," text " , ")
(replace "!" text " ! ")
(replace "?" text " ? ")
(replace "\n" text " ")
(replace "\r" text "")
(replace "\t" text " ")
(replace " +" text " " 4)
; this should be a list of all ISO 8859-1 Symbols
; see http://www.w3schools.com/tags/ref_entities.asp
; HTML entities should be replaced with UTF-8
(replace "&nbsp;" text " ")
(replace "&mdash;" text " - ")
(replace "&ndash;" text " - ")
(replace "&quot;" text "'")
(replace "&amp;" text "&")
;(replace "&lrm;" text (char 8206))
;(replace {\&#(\d+);} text (char (int $1)) 0)
)
(if cgi-mode
(set 'question (CGI:get "query"))
(set 'question (last (parse (main-args -1) "/"))) ; default is 'query'
)
(set 'page (get-url
(string "http://www.google.com/search?q=" (url-encode question)
"&num=22&ie=UTF-8&oe=UTF-8")))
(set 'links (find-all LINK_REGEX page $1))
(when cgi-mode
(println {<html><font face="Verdana"><head>})
(println {<META http-equiv="Content-Type" CONTENT="text/html; charset=utf-8" />})
(println {</head><body>}))
(println "query term: " (url-decode question) BRK)
(set 'START_TIME (time-of-day))
######################### this is, where all the pages are retrieved #################
; spawn a childprocess for each link
(dolist (lnk (0 20 links))
(set 'pid (spawn 'page (get-url lnk 4000)))
(push (list pid lnk) SITES -1))
; this gets executed whenever a page has been retrieved
(define (page-ready-event pid)
(let (link (0 80 (lookup pid SITES)))
(set 'link (url-decode link))
(println (inc cnt) " pid:" pid " " (- (time-of-day) START_TIME) " ms " link BRK)
(push (lower-case (clean-html page)) CONTENT -1)
(inc xferred (length page)))
)
; start waiting for pages
(println "waiting: ..." BRK BRK)
(unless (sync 10000 page-ready-event)
(println BRK "timeout" BRK))
#####################################################################################
(println BRK "bytes transferred: " xferred BRK)
(println "total time: " (- (time-of-day) START_TIME) " ms" BRK)
######################### all the counting is done here using bayes-train ###########
(catch (load "Lex") 'result)
(println "training: " (time (bayes-train (find-all {[a-zA-Z]+} CONTENT) 'Lex)) " ms" BRK)
(println "total words processed: " (Lex:total 0) BRK)
(println BRK)
#####################################################################################
; sort by frequency and print in four columns
(set 'items (sort (Lex) (fn (x y) (> (x 1 0) (y 1 0)))))
(set 'items (transpose (explode items (ceil (div (length items) 4)))))
(when cgi-mode (println "<pre>"))
(dolist (line items)
(dolist (item line)
(if (and item (< (length (item 0)) 19))
(print (format "%-18s %5d, " (item 0) (item 1 0)))
(print (format "%-18s , " " ")))) ; nil
(println))
(when cgi-mode
(println "</pre>")
(println "</body></font></html>"))
; if accumulating results over various queries
;(save "Lex" 'Lex)
(exit)