part 1 of spam detection
DESCRIPTION
Algorithms to pull out certain chunks of emails to start the spam detection process. TRANSCRIPT
Spam Emails pt 1 - Austin Kinion. Algorithm for Detecting Spam Emails, part 1
setwd("~/Desktop/SAT/easy_ham")
filename <- list.files()[567]
filename

# First function: read one email file and parse its header.
#
# Args:
#   filename: path of the email file to read.
#
# Returns:
#   If the header has a "Content-Type" field: an unnamed list holding
#   (1) the parsed header as returned by read.dcf(all = TRUE) and
#   (2) the Content-Type value.
#   Otherwise: the parsed header alone.
mail <- function(filename) {
  # Close the file connection on every exit path, including errors.
  con <- file(filename, open = "rt")
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  if (length(text) == 0) {
    stop("cannot parse header of empty file: ", filename, call. = FALSE)
  }

  # The header stops at the first blank line. Without any blank line the
  # whole file is treated as header rather than indexing with NA.
  end.of.header <- which(text == "")[1]
  if (is.na(end.of.header)) {
    end.of.header <- length(text)
  }

  # A leading "From " envelope line is not a "Field: value" pair, so it is
  # skipped. Only a line that *starts* with "From " counts.
  first.line <- if (startsWith(text[1], "From ")) 2 else 1
  head.lines <- text[first.line:end.of.header]

  # The text connection must be closed too, otherwise it leaks.
  header.con <- textConnection(head.lines)
  on.exit(close(header.con), add = TRUE)
  head <- read.dcf(header.con, all = TRUE)

  if ("Content-Type" %in% colnames(head)) {
    content.type <- head[["Content-Type"]]
    return(list(head, content.type))
  }
  head
}
testemail <- mail(filename)
# mail() returns list(header, content.type) only when the header has a
# Content-Type field; otherwise it returns the header data frame alone, and
# indexing that with [[2]] would silently pick an unrelated header column.
# An empty string stands for "no Content-Type".
content.type <- if (is.data.frame(testemail)) "" else testemail[[2]]
content.type
# Extract the multipart boundary string from a Content-Type value.
#
# Args:
#   content.type: Content-Type header value, e.g.
#     'multipart/mixed; boundary="abc"'.
#
# Returns:
#   The boundary with surrounding double quotes removed, or NULL when the
#   value carries no boundary parameter.
Boundary <- function(content.type) {
  # Match only the boundary parameter itself: either a quoted string or a run
  # of characters up to the next ';' or whitespace. This keeps '=' characters
  # inside the boundary and drops any parameters that follow it.
  pattern <- "boundary[[:space:]]*=[[:space:]]*(\"[^\"]*\"|[^;[:space:]]+)"
  hit <- regmatches(
    content.type,
    regexpr(pattern, content.type, ignore.case = TRUE)
  )
  if (length(hit) == 0) {
    return(NULL)
  }

  # Drop the leading 'boundary=' and then the quotes.
  value <- sub("^[^=]*=[[:space:]]*", "", hit[1])
  gsub("\"", "", value, fixed = TRUE)
}

boundary <- Boundary(content.type)
boundary
# Locate the body of an email and the lines on which boundary markers sit.
#
# Args:
#   content.type: Content-Type header value of the email.
#   filename: path of the email file; it is read again in full here.
#   boundary: accepted for compatibility only. The boundary is always
#     recomputed from content.type via Boundary().
#
# Returns:
#   An unnamed list of five elements:
#     [[1]] boundary string (character(0) when there is none)
#     [[2]] all lines of the file
#     [[3]] line numbers equal to "--<boundary>" (integer(0) when none)
#     [[4]] line numbers equal to "--<boundary>--" (integer(0) when none)
#     [[5]] body text pasted into a single string
body.and.bound <- function(content.type, filename, boundary) {
  text <- readLines(filename)
  # The header ends at the first blank line; NA when the file has none.
  header.end <- which(text == "")[1]

  boundary <- NULL
  if (isTRUE(grepl("boundary", content.type, ignore.case = TRUE)[1])) {
    boundary <- Boundary(content.type)
  }
  # A NULL/NA/empty boundary would turn the marker into a bare "--", which
  # also matches signature separators, so treat it as "no boundary".
  has.boundary <- length(boundary) == 1 && !is.na(boundary) && nzchar(boundary)

  if (!has.boundary) {
    # No attachment: the body runs from the end of the header to the last
    # line. Zero-length values are returned so that the length() == 0 checks
    # in find.attach(), special.cond() and last.boundary() recognise this case.
    body <- if (is.na(header.end)) {
      character(0)
    } else {
      text[header.end:length(text)]
    }
    # NOTE(review): lines are joined with "--" here but with " " in the
    # multipart case below; kept as found - confirm which one is intended.
    return(list(
      character(0), text, integer(0), integer(0),
      paste(body, collapse = "--")
    ))
  }

  # Each part starts with "--<boundary>"; the final marker adds a trailing "--".
  open.marker <- paste0("--", boundary)
  close.marker <- paste0(open.marker, "--")
  # Compare with trailing whitespace removed so padded marker lines still match.
  stripped <- trimws(text, which = "right")
  body.text <- which(stripped == open.marker)
  last.line <- which(stripped == close.marker)

  # The body sits between the first blank line and the first boundary marker,
  # or runs to the end of the file when no marker line was found.
  body.end <- if (length(body.text) > 0) body.text[1] - 1 else length(text)
  body <- if (is.na(header.end) || header.end > body.end) {
    character(0)
  } else {
    text[header.end:body.end]
  }

  list(boundary, text, body.text, last.line, paste(body, collapse = " "))
}

output <- body.and.bound(content.type, filename, boundary)
output
# Split a multipart email that HAS a closing boundary into its attachments.
#
# Args:
#   boundary: boundary string (zero length when the email has none).
#   text: all lines of the email.
#   body.text: line numbers of the "--<boundary>" marker lines.
#   last.line: line number(s) of the closing "--<boundary>--" marker.
#
# Returns:
#   "" when there is no boundary, no marker line or no closing marker.
#   Otherwise a list with one entry per attachment, each a list with
#   'Header of email' (first line of the part, up to the first ';') and
#   'Body of email' (remaining lines of the part joined by spaces).
find.attach <- function(boundary = output[[1]], text = output[[2]],
                        body.text = output[[3]], last.line = output[[4]]) {
  # Non-numeric positions (e.g. a "" placeholder) cannot be used as indices,
  # so they are treated like missing positions.
  usable <- length(boundary) > 0 &&
    is.numeric(body.text) && length(body.text) > 0 &&
    is.numeric(last.line) && length(last.line) > 0
  if (!usable) {
    return("")
  }

  # Part i spans from the line after marker i to the line before marker i + 1;
  # the last part ends just before the closing marker.
  first <- body.text + 1
  last <- c(body.text[-1], last.line[1]) - 1

  unname(Map(function(from, to) {
    # An empty part (two adjacent markers) yields no lines rather than a
    # reversed from:to range.
    part <- if (from <= to) text[from:to] else character(0)
    list(
      "Header of email" = strsplit(part[1], ";")[[1]][1],
      "Body of email" = paste(part[-1], collapse = " ")
    )
  }, first, last))
}

end.of.boundary <- find.attach(
  boundary = output[[1]], text = output[[2]],
  body.text = output[[3]], last.line = output[[4]]
)
# Split a multipart email that has NO closing boundary into its attachments.
#
# Args:
#   boundary: boundary string (zero length when the email has none).
#   text: all lines of the email.
#   body.text: line numbers of the "--<boundary>" marker lines.
#   last.line: line number(s) of the closing marker; must be empty for this
#     function to extract anything.
#
# Returns:
#   "" when there is no boundary, no marker line, or a closing marker exists.
#   Otherwise a list with one entry per attachment, each a list with
#   'Header of email' (first line of the part, up to the first ';') and
#   'Body of email' (remaining lines of the part joined by spaces).
special.cond <- function(boundary = output[[1]], text = output[[2]],
                         body.text = output[[3]], last.line = output[[4]]) {
  # Non-numeric positions (e.g. a "" placeholder) cannot be used as indices,
  # so they are treated like missing positions.
  usable <- length(boundary) > 0 &&
    is.numeric(body.text) && length(body.text) > 0 &&
    length(last.line) == 0
  if (!usable) {
    return("")
  }

  # Part i spans from the line after marker i to the line before marker i + 1;
  # with no closing marker the last part runs to the end of the email.
  first <- body.text + 1
  last <- c(body.text[-1] - 1, length(text))

  unname(Map(function(from, to) {
    # An empty part yields no lines rather than a reversed from:to range.
    part <- if (from <= to) text[from:to] else character(0)
    list(
      "Header of email" = strsplit(part[1], ";")[[1]][1],
      "Body of email" = paste(part[-1], collapse = " ")
    )
  }, first, last))
}

no.bound.spec.cond <- special.cond(
  boundary = output[[1]], text = output[[2]],
  body.text = output[[3]], last.line = output[[4]]
)
# Extract attachments, choosing the splitter by whether a closing boundary
# marker was found.
#
# Args:
#   boundary: boundary string.
#   text: all lines of the email.
#   body.text: line numbers of the "--<boundary>" marker lines.
#   last.line: line number(s) of the closing "--<boundary>--" marker.
#
# Returns:
#   The result of special.cond() when last.line is empty, otherwise the
#   result of find.attach().
last.boundary <- function(boundary = output[[1]], text = output[[2]],
                          body.text = output[[3]], last.line = output[[4]]) {
  # Forward this function's own arguments; reading the global `output` here
  # would make the parameters meaningless.
  if (length(last.line) == 0) {
    special.cond(
      boundary = boundary, text = text,
      body.text = body.text, last.line = last.line
    )
  } else {
    find.attach(
      boundary = boundary, text = text,
      body.text = body.text, last.line = last.line
    )
  }
}

last.bound <- last.boundary(
  boundary = output[[1]], text = output[[2]],
  body.text = output[[3]], last.line = output[[4]]
)
# Get list of entries in the working directory.
dirs <- list.files()
dirs

# Iterate over subdirectories only; regular files are skipped because they
# cannot be listed as directories.
for (dir in dirs[dir.exists(dirs)]) {
  # List files in the subdirectory without changing the working directory,
  # so an error part-way through cannot leave the session in a subdirectory.
  files <- list.files(dir)
  for (f in files) {
    # do something to file `f`
    # process_email(file.path(dir, f))
    print(f)
  }
}

path <- list.files("SAT/", recursive = TRUE)
# NOTE(review): trainMessages is not created anywhere in this part of the
# script; presumably it is built elsewhere with one element per file in
# `path` - confirm before running.
names(trainMessages) <- path
trainMessages
# save() returns NULL invisibly, so it must not be followed by a second "()".
save(trainMessages, file = "TrainingMessages.rda")