# Reads +ciff_file+ line by line and builds one DataHarvester::Ad per ad
# record found in it.
#
# A record runs from a line matching AD_START_PAT through the next line
# matching AD_END_PAT (both inclusive); lines outside a record are skipped.
# Inside a record each line is classified by the first rule that applies:
#
# 1. AD_START_PAT  - resets the per-ad state to its placeholder defaults.
# 2. ATTR_PAT      - name/value are the line with the ATTR_PAT match replaced
#                    by capture group 1 / 2 (line ending removed).
#                    "adPrintID" sets the ad id, "category" sets the
#                    category, any other name not in IGNORE is stored in the
#                    fielded-attribute hash.
# 3. text block    - lines from one matching TEXT_START_PAT through one
#                    matching TEXT_END_PAT are appended to the ad text with
#                    both markers and the line ending stripped. On the closing
#                    line, <...> tags become spaces and runs of spaces are
#                    collapsed.
# 4. AD_END_PAT    - emits the ad.
#
# ciff_file:: path of the file to read
# eval_mode:: when true, the fielded-attribute hash is passed to
#             DataHarvester::Ad.new as a fourth argument
#
# Returns the array of ads in the order they were completed.
def parse_file(ciff_file, eval_mode=false)
  ads = []
  in_ad = false    # currently between an AD_START_PAT and an AD_END_PAT line
  in_text = false  # currently inside an unterminated text block
  text = cat = ad_id = fielded_atts = nil

  File.open(ciff_file) do |f|
    f.each_line do |line|
      # A record can only be entered on a line matching AD_START_PAT.
      unless in_ad
        next unless line =~ AD_START_PAT
      end
      # The closing pattern is checked on every record line, including the
      # opening one, so a line matching both patterns does not leave the
      # record open.
      in_ad = (line !~ AD_END_PAT)

      if line =~ AD_START_PAT
        text = ""
        cat = "missing_cat"
        ad_id = "no_id_found"
        fielded_atts = {}
        next
      end

      # Attribute lines take precedence over text handling, so they do not
      # advance the text-block state even when a text block is open.
      if line =~ ATTR_PAT
        name = line.sub(ATTR_PAT,'\1').chomp
        val = line.sub(ATTR_PAT,'\2').chomp
        case name
        when "adPrintID"
          ad_id = val
        when "category"
          cat = val
        else
          fielded_atts[name] = val unless IGNORE.include?(name)
        end
        next
      end

      if in_text || line =~ TEXT_START_PAT
        closes_text = !(line !~ TEXT_END_PAT)
        in_text = !closes_text
        text += line.sub(TEXT_START_PAT, "").sub(TEXT_END_PAT, "").chomp
        if closes_text
          # Text is complete: drop markup and normalise spacing.
          text.gsub!(/<.*?>/," ")
          text.squeeze!(" ")
        end
      elsif line =~ AD_END_PAT
        ad = if eval_mode
               DataHarvester::Ad.new(text, ad_id, cat, fielded_atts)
             else
               DataHarvester::Ad.new(text, ad_id, cat)
             end
        ads << ad
      end
    end
  end

  ads
end