From 3d0afc953ee0db0dc1d950724a0f9cd342e15361 Mon Sep 17 00:00:00 2001
From: Markus Koch
Date: Thu, 30 Apr 2020 18:29:53 +0200
Subject: Extract categories, description, and thumbnail from rendered HTML pages

---
 scripts/geojson/fetch_all_points.sh  | 19 +++-----------
 scripts/geojson/get_rendered_meta.sh | 49 ++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 16 deletions(-)
 create mode 100755 scripts/geojson/get_rendered_meta.sh

diff --git a/scripts/geojson/fetch_all_points.sh b/scripts/geojson/fetch_all_points.sh
index 81c46e0..d1ec504 100755
--- a/scripts/geojson/fetch_all_points.sh
+++ b/scripts/geojson/fetch_all_points.sh
@@ -22,22 +22,9 @@ for line in $data; do
     fi
     if [[ "$title" != "" && "$coords" != "" ]]; then
         echo "{\"type\": \"Feature\", \"properties\": {\"name\": \"$title\","
-        echo -n "\"categories\": ["
-        urltitle=`echo "$title" | sed 's/ /%20/g'`
-        cjson=`curl -s "https://wiki.linux-forks.de/mediawiki/api.php?action=query&format=json&titles=$urltitle&prop=categories" | json_reformat -m`
-        categories=`echo "$cjson" | sed -n 's/.*\("categories":[^]]\+\).*/\1/p'`;
-        IFS='}'
-        fc="true";
-        for entry in $categories; do
-            if [[ "$fc" != "true" ]]; then
-                echo -n ",";
-            fi
-            category=`echo "$entry" | sed -n 's/.*Category:\([^"]\+\).*/\1/p'`
-            echo -n "\"$category\"";
-            fc="false";
-        done;
-        IFS=$'\n'
-        echo "]},";
+        urltitle=`echo "$title" | sed 's/ /_/g'`
+        ./get_rendered_meta.sh "https://wiki.linux-forks.de/mediawiki/index.php/$urltitle"
+        echo "},";
         echo "\"geometry\": {\"type\": \"Point\", \"coordinates\": [$coords]}},";
         title="";
         coords="";
diff --git a/scripts/geojson/get_rendered_meta.sh b/scripts/geojson/get_rendered_meta.sh
new file mode 100755
index 0000000..120645c
--- /dev/null
+++ b/scripts/geojson/get_rendered_meta.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+BASE="https://wiki.linux-forks.de"
+
+mode="find_description"
+
+thumbnail=""
+categories=""
+
+data=`curl -s "$1"`
+
+temp=`echo "$data" | sed -n 's/<p>\(.\+\).*/\1/p' | head -n1`
+if [ "$temp" != "" ]; then
+    temp=`echo "$temp" | sed "s#href=\"#href=\"$BASE#g" | sed 's/"/\\\\"/g' | sed 's/\t//g'`
+    description="$temp"
+    mode="find_infobox";
+fi
+
+IFS=$'>';
+for line in $data; do
+    if [ "$mode" == "find_infobox" ]; then
+        if [ "`echo \"$line\" | grep 'infobox'`" != "" ]; then
+            mode="image";
+        fi
+    elif [ "$mode" == "image" ]; then
+        temp=`echo "$line" | sed -n 's/.*img.*src="\([^"]\+\).*/\1/p'`;
+        if [ "$temp" != "" ]; then
+            thumbnail="$BASE$temp"
+            mode="find_cat"
+        fi
+    elif [ "$mode" == "find_cat" ]; then
+        if [ "`echo \"$line\" | grep 'mw-normal-catlinks'`" != "" ]; then
+            mode="cat";
+        fi
+    elif [ "$mode" == "cat" ]; then
+        temp=`echo "$line" | sed -n 's/.*title="Category:\([^"]\+\).*/\1/pg' | grep -v 'page does not exist'`
+        if [ "$temp" != "" ]; then
+            if [ "$categories" != "" ]; then
+                categories="$categories,"
+            fi
+            categories="$categories\"$temp\""
+        fi
+    fi
+done
+IFS=" ";
+
+echo "\"categories\": [$categories],"
+echo "\"image\": \"$thumbnail\","
+echo "\"description\": \"$description\""
-- 
cgit v1.2.3