#!/usr/bin/env newlisp

; Retrieves a page with 20 links via Google containing 
; the search word, then spawns 20 child processes, each
; retrieving one page from a link. All pages are then 
; HTML-cleaned, tokenized and words are counted using
; bayes-train.

; USAGE: query <search-word>

; or rename to query.cgi and run on a web server
; using the following query.html:

; NOTE!!! Google changes the formatting of links (see LINK_REGEX) once
; in a while and then the whole thing breaks! 


;<html> 
;<head><title>Query</title></head> 
;<body><font face="Verdana">
;<p>Parallel processing support on newLISP using 'spawn'</p>
;<form name="query" action="query.cgi" method="POST"> 
;Input a search word:&nbsp;<input type="text" name="query" size="48"> 
;&nbsp;<input type="submit" value="Submit" name="submit"> 
;</form> 
;</font></body></html> 

(set 'cgi-mode nil)
;(set 'cgi-mode true) ; run as CGI get question from query.html

(when cgi-mode 
    (module "cgi.lsp")
    (print "Content-Type: text/html\n\n"))

;; globals and constants in this namespace
(set 'LINK_REGEX {q=(http://[^>& ]+)&})
(set 'START_TIME 0)
(set 'SITES '())
(set 'CONTENT "")
(constant 'BRK (if cgi-mode "<br>" ""))

(define (url-decode str)
  (replace "%([0-9A-F][0-9A-F])" str (char (int $1 0 16)) 1))

(define (url-encode str)  
  (replace {([^a-zA-Z0-9])} str (format "%%%2X" (char $1)) 0))

(define (clean-html text)
	(replace "<script.*?</script>" text "" 4)
	(replace "<style.*?</style>" text "" 4)
	(replace "<head.*?</head>" text "" 4)
	(replace "<[^>]*>" text " " 0)
	(replace "." text " . ")
	(replace "," text " , ")
	(replace "!" text " ! ")
	(replace "?" text " ? ")
	(replace "\n" text " ")
	(replace "\r" text "")
	(replace "\t" text " ")
	(replace " +" text " " 4)
	; this should be a list of all ISO 8859-1 Symbols
	; see http://www.w3schools.com/tags/ref_entities.asp
	; HTML entities should be replaced with UTF-8
	(replace "&nbsp;" text " ")
	(replace "&mdash;" text " - ")
	(replace "&ndash;" text " - ")
	(replace "&quot;" text "'")
	(replace "&amp;" text "&")
	;(replace "&lrm;" text (char 8206))
	;(replace {\&#(\d+);} text (char (int $1)) 0)
)

(if cgi-mode
    (set 'question (CGI:get "query"))
    (set 'question (last (parse (main-args -1) "/"))) ; default is 'query'
)
        

(set 'page (get-url 
	(string "http://www.google.com/search?q=" (url-encode question)
			"&num=22&ie=UTF-8&oe=UTF-8")))

(set 'links (find-all LINK_REGEX page $1))

(when cgi-mode
    (println {<html><font face="Verdana"><head>})
    (println {<META http-equiv="Content-Type" CONTENT="text/html; charset=utf-8" />})
    (println {</head><body>}))

(println "query term: " (url-decode question) BRK)
(set 'START_TIME (time-of-day))

######################### this is, where all the pages are retrieved #################
; spawn a childprocess for each link
(dolist (lnk (0 20 links))
	(set 'pid (spawn 'page (get-url lnk 4000)))
	(push (list pid lnk) SITES -1))

; this gets executed whenever a page has been retrieved
(define (page-ready-event pid)
    (let (link (0 80 (lookup pid SITES)))
        (set 'link (url-decode link))
        (println (inc cnt) " pid:" pid " " (- (time-of-day) START_TIME) " ms " link BRK)
        (push (lower-case (clean-html page)) CONTENT -1)
        (inc xferred (length page)))
)

; start waiting for pages
(println "waiting: ..." BRK BRK)
(unless (sync 10000  page-ready-event)
	(println BRK "timeout" BRK))
#####################################################################################

(println BRK "bytes transferred: " xferred BRK)
(println "total time: " (- (time-of-day) START_TIME) " ms" BRK)

######################### all the counting is done here using bayes-train ###########
(catch (load "Lex") 'result)
(println "training: " (time (bayes-train (find-all {[a-zA-Z]+} CONTENT) 'Lex)) " ms" BRK)
(println "total words processed: " (Lex:total 0) BRK)
(println BRK)
#####################################################################################

; sort by frequency and print in four columns
(set 'items (sort (Lex) (fn (x y) (> (x 1 0) (y 1 0)))))
(set 'items (transpose (explode items (ceil (div (length items) 4)))))

(when cgi-mode (println "<pre>"))
(dolist (line items)
    (dolist (item line)
        (if (and item (< (length (item 0)) 19)) 
            (print (format "%-18s %5d, " (item 0) (item 1 0)))
            (print (format "%-18s      , " " ")))) ; nil
    (println))
		
(when cgi-mode
    (println "</pre>")
    (println "</body></font></html>"))

; if accumulating results over various queries
;(save "Lex" 'Lex)
(exit)