import re
import sys
import requests
import os
import subprocess
import time
import codecs
# Dictionary of the channels that will be scraped from the website. Entry format:
#   "Name of the channel as it appears on the website": "Number the channel will be assigned in the generated XML file",
channels_to_grab = {
    # Keys are matched against the link text on the scraped page, so they must
    # be spelled exactly as on the website. Insertion order is the order in
    # which the channels are processed.
    "Cultura": "3.1",
    "Univesp TV": "3.2",
    "TV Educação": "3.3",
    "Record News": "6.1",
    "RecordTV Litoral-Vale": "8.1",
    "Rede 21": "10.1",
    "TVB Santos": "12.1",
    "RedeTV!": "14.1",
    "Rede Vida": "15.1",
    "TV Tribuna Santos": "18.1",
    "Gazeta": "32.1",
    "ISTV": "36.1",
    "RBI TV": "38.1",
    "TV Unisantos": "40.1",
    "VTV": "46.1",
    "Santa Cecília TV": "52.1",
    "IDTV": "56.1",
}
# Generate WebGrab++.config.xml: copy the template config_partial.xml into it,
# download the channel listing page, then append one line per channel listed
# in channels_to_grab.

# Read the template first, so a missing template fails before the output file
# is created or truncated. `with` closes the handle even if reading raises.
# NOTE(review): the template is read with the platform default encoding while
# the output is written as UTF-8 -- confirm the encoding of config_partial.xml
# and pass it explicitly if it is UTF-8.
with open("config_partial.xml", "r") as model_file:
    template_text = model_file.read()

user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36"

# `with` guarantees the output file is flushed and closed even when the
# download or one of the writes raises.
with codecs.open("WebGrab++.config.xml", "w+", "utf-8") as config_file:
    config_file.write(template_text)

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    website = requests.get(
        "http://www.lineup.tv.br/gdc.php",
        headers={"User-Agent": user_agent},
        timeout=30,
    )
    page_text = website.text
    config_file.write("\r\n")

    for channel_name in channels_to_grab:
        print('canal: ', channel_name)
        # re.search scans the whole page for `Guia=<digits>">` immediately
        # followed by the channel name and `</a>`. Requiring digits right
        # after `Guia=` skips any `Guia=` occurrence that is not a numeric
        # code. The name is escaped so it is always matched literally, and
        # raw strings avoid invalid-escape warnings for `\d`.
        guide_match = re.search(
            r'Guia=(\d+)">' + re.escape(channel_name) + r'</a>',
            page_text,
        )
        if guide_match is None:
            # Surface a name that does not match the page instead of
            # silently ignoring the failed lookup.
            sys.stderr.write(
                "warning: channel not found on the page: " + channel_name + "\n"
            )
        # NOTE(review): neither the captured guide code nor the number from
        # channels_to_grab is written to the output -- only the channel name.
        # This looks like lost markup; confirm the intended line format.
        config_file.write("" + channel_name + "\r\n")

    config_file.write("\n")

print("WebGrab config file generated with the current channel codes for the channels requested. There's no error handling, so it must mean that the program finished sucessfully.")