Empezar

2021-11-30 20:14:19 -03:00 · 2021-11-30 20:14:19 -03:00 · 43d4b4aa15
commit 43d4b4aa15
6 changed files with 68 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+datos/
--- a/aria2-feeder/README.md
+++ b/aria2-feeder/README.md
@ -0,0 +1,10 @@
+Le inserta URLs a [aria2](https://aria2.github.io) a través de su interfaz [JSON-RPC vía HTTP GET](https://aria2.github.io/manual/en/html/aria2c.html#json-rpc-using-http-get). Pasa el encabezado `accept-encoding: gzip, br` para que los archivos se descarguen comprimidos.
+
+## Usar
+
+```sh
+# Para una URL
+./feed-aria2.sh "https://URL"
+# Para un archivo con una URL por línea
+lua feed-aria2.lua "archivo"
+```
--- a/aria2-feeder/feed-aria2.lua
+++ b/aria2-feeder/feed-aria2.lua
@ -0,0 +1,12 @@
+-- Feeds every URL listed in a text file (one URL per line) to aria2 by running
+-- ./feed-aria2.sh once per URL. Stops at the first URL the helper fails on.
+-- Usage: lua feed-aria2.lua FILE
+local file = arg[1]
+if not file then
+	io.stderr:write("usage: lua feed-aria2.lua FILE\n")
+	os.exit(1)
+end
+-- assert turns io.open's (nil, message) failure into a readable error.
+local handle = assert(io.open(file))
+
+-- Wraps s in single quotes for the shell. An embedded ' is written as '\''
+-- (close the quote, escaped quote, reopen) so a URL cannot break out of its
+-- argument. The extra parentheses drop gsub's second return value.
+local function shell_quote(s)
+	return "'"..(s:gsub("'", "'\\''")).."'"
+end
+
+-- os.execute reports success as 0 in Lua 5.1/LuaJIT and as true in Lua 5.2+.
+local function succeeded(status)
+	return status == true or status == 0
+end
+
+for line in handle:lines() do
+	-- Blank lines carry no URL, so there is nothing to send.
+	if line ~= "" then
+		if not succeeded(os.execute("./feed-aria2.sh "..shell_quote(line))) then
+			handle:close()
+			io.stderr:write("feed-aria2.sh failed for: "..line.."\n")
+			os.exit(1)
+		end
+	end
+end
+
+handle:close()
+
--- a/aria2-feeder/feed-aria2.sh
+++ b/aria2-feeder/feed-aria2.sh
@ -0,0 +1,5 @@
+#!/bin/sh -e
+
+params=$(echo -n "[[\"$1\"],{\"header\":[\"accept-encoding: gzip, br\"]}]" | base64 -w0)
+id=$(echo "$RANDOM * $RANDOM" | bc)
+curl -q "localhost:6800/jsonrpc?method=aria2.addUri&id=$id&params=$params"
--- a/otros-scripts/scrap-pages.lua
+++ b/otros-scripts/scrap-pages.lua
@ -0,0 +1,16 @@
+-- Deprecated: do not use this. It was originally used to download the subtitle
+-- listing pages, but it is quite inefficient; use aria2-feeder instead.
+-- Usage: lua scrap-pages.lua FIRST_PAGE LAST_PAGE
+-- Saves each page FIRST_PAGE..LAST_PAGE as index-N.htm with wget and stops at
+-- the first failed download.
+local start = tonumber(arg[1])
+local endv = tonumber(arg[2])
+
+-- Page numbers end up in a URL and a file name, so they must be whole numbers.
+if not start or not endv or start % 1 ~= 0 or endv % 1 ~= 0 then
+	io.stderr:write("usage: lua scrap-pages.lua FIRST_PAGE LAST_PAGE\n")
+	os.exit(1)
+end
+
+-- os.execute reports success as 0 in Lua 5.1/LuaJIT and as true in Lua 5.2+.
+local function succeeded(status)
+	return status == true or status == 0
+end
+
+for i = start, endv do
+	-- %d keeps the page number free of a ".0" suffix when the argument was
+	-- parsed as a float (Lua 5.3+).
+	local command = string.format(
+		"wget -nv 'https://www.subdivx.com/index.php?pg=%d' --header='accept-encoding: gzip, br' -O index-%d.htm",
+		i, i)
+	if not succeeded(os.execute(command)) then
+		io.stderr:write(string.format("wget failed for page %d\n", i))
+		os.exit(1)
+	end
+end
+
--- a/otros-scripts/scrap-subtitle-urls.lua
+++ b/otros-scripts/scrap-subtitle-urls.lua
@ -0,0 +1,24 @@
+-- Extracts the subtitle page URLs from a downloaded subtitle listing page and
+-- prints them one per line. Requires lynx.
+-- Example:
+--   for i in index-*.htm; do lua scrap-subtitle-urls.lua $i; done > urls
+
+local file = arg[1]
+if not file then
+	io.stderr:write("usage: lua scrap-subtitle-urls.lua FILE\n")
+	os.exit(1)
+end
+
+-- Wraps s in single quotes for the shell. An embedded ' is written as '\''
+-- (close the quote, escaped quote, reopen) so the file name stays a single
+-- argument. The extra parentheses drop gsub's second return value.
+local function shell_quote(s)
+	return "'"..(s:gsub("'", "'\\''")).."'"
+end
+
+-- lynx renders the page as text; its dump is read in one go.
+local handle = assert(io.popen("lynx -dump -width=1000 "..shell_quote(file)))
+local result = handle:read("*a")
+handle:close()
+-- Earlier attempts at a precise pattern, kept for reference:
+-- local pattern = "href=\"(https://www.subdivx.com/X6[%wÃ±-]+%.html)\">"
+-- local pattern = "(https://www.subdivx.com/X6[%w%%-`%[%]%{%}%(%)%+']+%.html)"
+-- Specific patterns proved too fragile; instead, go line by line and match
+-- anything that looks like a subtitle URL. The dots of the host name are
+-- escaped so they only match a literal ".".
+local pattern = "%. (https://www%.subdivx%.com/X6.+%.html)"
+-- count is only read by the disabled sanity check at the bottom.
+local count = 0
+for line in result:gmatch("[^\n]+") do
+	for url in line:gmatch(pattern) do
+		print(url)
+		count = count + 1
+	end
+end
+
+-- Sanity check that every link on a page was picked up; no longer needed.
+-- if not (count == 100) then
+-- 	print(file.." tiene "..count.." subtitulos")
+-- end