97 lines
2.4 KiB
Go
97 lines
2.4 KiB
Go
package main
|
|
|
|
import (
|
|
"database/sql"
|
|
"log"
|
|
"time"
|
|
|
|
rss "github.com/ungerik/go-rss"
|
|
)
|
|
|
|
// GenericRSSFetcher holds the settings needed to read one RSS feed and store
// its items in the database.
type GenericRSSFetcher struct {
	// MedioName is the name stored in the database's "medio" field.
	MedioName string
	// FeedURL is the address the feed is read from.
	FeedURL string
	// DB is the database handle used to store every item.
	DB *sql.DB

	// SubsetName names a specific feed; it is used for debugging output.
	SubsetName string

	// PubDateFormat is the time layout used to parse each item's publication
	// date. When it is empty, the RSS library's own date parsing is used.
	PubDateFormat string
}
|
|
|
|
// DefaultDateFormat is the time layout used to format publication dates
// before they are stored in the database.
//
// Per the original note it corresponds to SQLiteTimestampFormats[0]
// (presumably the SQLite driver's list of timestamp layouts; verify against
// the driver in use).
const DefaultDateFormat = "2006-01-02 15:04:05.999999999-07:00"
|
|
|
|
// TODO: Capturar errores
|
|
func (fetcher GenericRSSFetcher) Fetch() {
|
|
resp, err := rss.Read(fetcher.FeedURL, false)
|
|
if err != nil {
|
|
log.Panicln(err)
|
|
}
|
|
|
|
feed, err := rss.Regular(resp)
|
|
if err != nil {
|
|
log.Panicln(err)
|
|
}
|
|
|
|
for _, entry := range feed.Item {
|
|
|
|
var parsedDate time.Time
|
|
|
|
if len(fetcher.PubDateFormat) != 0 {
|
|
parsedDate, err = entry.PubDate.ParseWithFormat(fetcher.PubDateFormat)
|
|
} else {
|
|
parsedDate, err = entry.PubDate.Parse()
|
|
}
|
|
if err != nil {
|
|
log.Panicln(err)
|
|
}
|
|
|
|
formatted_pub_date := parsedDate.Format(DefaultDateFormat)
|
|
|
|
_, err = fetcher.DB.Exec(`
|
|
INSERT INTO notas(medio, title, content, link, publication_date)
|
|
VALUES(?, ?, ?, ?, ?)
|
|
ON CONFLICT(link) DO UPDATE SET
|
|
title=excluded.title,
|
|
content=excluded.content;
|
|
`, fetcher.MedioName, entry.Title, entry.ContentEncoded, entry.Link, formatted_pub_date)
|
|
if err != nil {
|
|
log.Println("Error when saving nota", err)
|
|
}
|
|
}
|
|
log.Printf("[%s/%s] Procesé %d notas", fetcher.MedioName, fetcher.SubsetName, len(feed.Item))
|
|
}
|
|
|
|
func Pagina12(db *sql.DB, subsetName string, url string) GenericRSSFetcher {
|
|
return GenericRSSFetcher{
|
|
MedioName: "pagina_12_rss",
|
|
SubsetName: subsetName,
|
|
FeedURL: url,
|
|
DB: db,
|
|
PubDateFormat: time.RFC1123,
|
|
}
|
|
}
|
|
|
|
func cronologicalFetcher(db *sql.DB) {
|
|
for true {
|
|
GenericRSSFetcher{
|
|
MedioName: "la_nacion_rss",
|
|
FeedURL: "https://www.lanacion.com.ar/arcio/rss/",
|
|
DB: db,
|
|
}.Fetch()
|
|
GenericRSSFetcher{
|
|
MedioName: "infobae",
|
|
FeedURL: "https://www.infobae.com/feeds/rss/",
|
|
DB: db,
|
|
}.Fetch()
|
|
Pagina12(db, "portada", "https://www.pagina12.com.ar/rss/portada").Fetch()
|
|
Pagina12(db, "edicion-impresa", "https://www.pagina12.com.ar/rss/edicion-impresa").Fetch()
|
|
Pagina12(db, "secciones/el-pais", "https://www.pagina12.com.ar/rss/secciones/el-pais/notas").Fetch()
|
|
Pagina12(db, "secciones/sociedad", "https://www.pagina12.com.ar/rss/secciones/sociedad/notas").Fetch()
|
|
time.Sleep(time.Minute * 2)
|
|
}
|
|
|
|
}
|