diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c370cb6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +test.db diff --git a/fetcher.go b/fetcher.go new file mode 100644 index 0000000..d4468bb --- /dev/null +++ b/fetcher.go @@ -0,0 +1,90 @@ +package main + +import ( + "database/sql" + "log" + "time" + + rss "github.com/ungerik/go-rss" +) + +type GenericRSSFetcher struct { + // MedioName es el nombre para poner en la BD (field "medio") + MedioName string + FeedURL string + DB *sql.DB + + // SubsetName es un nombre para un feed específico que se usa para debug + SubsetName string + + PubDateFormat string +} + +// SQLiteTimestampFormats[0] +const DefaultDateFormat = "2006-01-02 15:04:05.999999999-07:00" + +// TODO: Capturar errores +func (fetcher GenericRSSFetcher) Fetch() { + resp, err := rss.Read(fetcher.FeedURL, false) + if err != nil { + log.Panicln(err) + } + + feed, err := rss.Regular(resp) + if err != nil { + log.Panicln(err) + } + + for _, entry := range feed.Item { + + var parsedDate time.Time + + if len(fetcher.PubDateFormat) != 0 { + parsedDate, err = entry.PubDate.ParseWithFormat(fetcher.PubDateFormat) + } else { + parsedDate, err = entry.PubDate.Parse() + } + if err != nil { + log.Panicln(err) + } + + formatted_pub_date := parsedDate.Format(DefaultDateFormat) + + _, err = fetcher.DB.Exec(` + INSERT INTO notas(medio, title, link, publication_date) + VALUES(?, ?, ?, ?) + ON CONFLICT(link) DO UPDATE SET + title=excluded.title; + `, fetcher.MedioName, entry.Title, entry.Link, formatted_pub_date) + if err != nil { + log.Println("Error when saving nota", err) + } + } + log.Printf("[%s/%s] Procesé %d notas", fetcher.MedioName, fetcher.SubsetName, len(feed.Item)) +} + +func Pagina12(db *sql.DB, subsetName string, url string) GenericRSSFetcher { + return GenericRSSFetcher{ + MedioName: "pagina_12_rss", + SubsetName: subsetName, + FeedURL: url, + DB: db, + PubDateFormat: time.RFC1123, + } +} + +func cronologicalFetcher(db *sql.DB) { + for true { + GenericRSSFetcher{ + MedioName: "la_nacion_rss", + FeedURL: "https://www.lanacion.com.ar/arcio/rss/", + DB: db, + }.Fetch() + Pagina12(db, "portada", "https://www.pagina12.com.ar/rss/portada").Fetch() + Pagina12(db, "edicion-impresa", "https://www.pagina12.com.ar/rss/edicion-impresa").Fetch() + Pagina12(db, "secciones/el-pais", "https://www.pagina12.com.ar/rss/secciones/el-pais/notas").Fetch() + Pagina12(db, "secciones/sociedad", "https://www.pagina12.com.ar/rss/secciones/sociedad/notas").Fetch() + time.Sleep(time.Minute * 2) + } + +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c11bc17 --- /dev/null +++ b/go.mod @@ -0,0 +1,9 @@ +module medios.nulo.in/web + +go 1.19 + +require ( + github.com/mattn/go-sqlite3 v1.14.15 // indirect + github.com/paulrosania/go-charset v0.0.0-20190326053356-55c9d7a5834c // indirect + github.com/ungerik/go-rss v0.0.0-20200405130036-81ac15598626 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..1fd2af5 --- /dev/null +++ b/go.sum @@ -0,0 +1,6 @@ +github.com/mattn/go-sqlite3 v1.14.15 h1:vfoHhTN1af61xCRSWzFIWzx2YskyMTwHLrExkBOjvxI= +github.com/mattn/go-sqlite3 v1.14.15/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= +github.com/paulrosania/go-charset v0.0.0-20190326053356-55c9d7a5834c h1:P6XGcuPTigoHf4TSu+3D/7QOQ1MbL6alNwrGhcW7sKw= +github.com/paulrosania/go-charset v0.0.0-20190326053356-55c9d7a5834c/go.mod h1:YnNlZP7l4MhyGQ4CBRwv6ohZTPrUJJZtEv4ZgADkbs4= +github.com/ungerik/go-rss v0.0.0-20200405130036-81ac15598626 h1:kCZvptlArqbfrFWL0V0YWwV9iXEvMKaUMjJvXQ/wsE4= +github.com/ungerik/go-rss v0.0.0-20200405130036-81ac15598626/go.mod h1:UeGmbG1AM6Cx0FfruZOzOC02lDvXbUWV+06tAxgCoGA= diff --git a/main.go b/main.go new file mode 100644 index 0000000..d4aa9e4 --- /dev/null +++ b/main.go @@ -0,0 +1,88 @@ +package main + +import ( + "database/sql" + "fmt" + "html" + "log" + "net/http" + + _ "github.com/mattn/go-sqlite3" +) + +// type Nota struct { +// Medio string `json:"medio"` +// Title string `json:"title"` +// Link string `json:"link"` +// PublicationDate string `json:"publication_date" db:"publication_date"` +// CreatedAt string `json:"created_at" db:"created_at"` +// } + +type fooHandler struct { + DB *sql.DB +} + +func (foo fooHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + + fmt.Fprintf(w, ` + + Observatorio de medios + `) +} + +func main() { + db, err := sql.Open("sqlite3", "./test.db?cache=shared") + if err != nil { + log.Panic(err) + } + defer db.Close() + + db.SetMaxOpenConns(1) + + http.Handle("/foo", fooHandler{db}) + + go cronologicalFetcher(db) + + log.Println("Listening in :8080") + log.Fatal(http.ListenAndServe(":8080", nil)) +} diff --git a/migrations/00001-articulos.sql b/migrations/00001-articulos.sql new file mode 100644 index 0000000..5d5107e --- /dev/null +++ b/migrations/00001-articulos.sql @@ -0,0 +1,8 @@ +CREATE TABLE notas ( + medio TEXT NOT NULL, + title TEXT NOT NULL, + link TEXT NOT NULL, + publication_date DATETIME NOT NULL, + created_at DATETIME NOT NULL DEFAULT current_timestamp, + UNIQUE(link) +);