Code source de functions

import requests
import psycopg2
from lxml import etree
import xmltodict

from SPARQLWrapper import SPARQLWrapper, JSON


def getLicence(licences):
    """Return the licence name(s) found in a parsed ``license`` node.

    :param licences: either a single mapping with a ``"name"`` key, or an
        iterable of such mappings.
    :return: the ``"name"`` value itself for a single mapping; otherwise the
        names of all entries joined with ``"; "`` (an empty string when the
        iterable is empty).
    """
    # A lone licence arrives as a mapping rather than as a list of mappings.
    if isinstance(licences, dict):
        return licences["name"]
    return "; ".join(entry["name"] for entry in licences)


# Seconds to wait for the Commons metadata service before giving up, so that
# a single stalled connection cannot block the whole import.
_COMMONS_API_TIMEOUT = 30


def _extract_author(metadata):
    """Pick the author to store for a media from the parsed Commons metadata.

    The ``author`` field of ``metadata["response"]["file"]`` is tried first,
    then ``uploader``. A field is used only when its length is below 500; a
    missing, empty (``None``) or oversized field falls through to the next
    candidate.

    :param metadata: mapping produced by ``xmltodict.parse`` on the Commons
        API response.
    :return: the selected value, or ``"Commons"`` when neither field is usable.
    """
    for field in ("author", "uploader"):
        try:
            candidate = metadata["response"]["file"][field]
            # A value equal to the default does not stop the search: the next
            # field is still tried.
            if len(candidate) < 500 and candidate != "Commons":
                return candidate
        except TypeError:
            # Raised by len(None), i.e. the field is present but empty.
            print("no author")
        except Exception as e:
            print("Error during author extraction")
            print(e)
    return "Commons"


def main(
    dbconnexion, cd_refs, WD_MEDIA_PROP, TAXHUB_MEDIA_ID_TYPE, refreshAtlas=True, simulate=True
):
    """Import Wikimedia Commons medias for a list of taxa into ``taxonomie.t_medias``.

    For every taxon, Wikidata is queried for the items whose ``P3186``
    property equals the taxon's cd_ref and that carry the ``WD_MEDIA_PROP``
    property. For each media found, its metadata (name, author, licences) is
    fetched from the Commons API and one row is inserted. Errors are printed
    and the import carries on with the next media or taxon.

    :param dbconnexion: open database connection (psycopg2-style: ``cursor()``,
        ``commit()``, ``rollback()``).
    :param cd_refs: iterable of rows whose first element is the cd_ref.
    :param WD_MEDIA_PROP: Wikidata property identifier holding the media; it is
        substituted after ``wdt:`` in the SPARQL query.
    :param TAXHUB_MEDIA_ID_TYPE: value stored in the ``id_type`` column of the
        inserted rows.
    :param refreshAtlas: when true (and not simulating), refresh the
        ``atlas.vm_medias`` and ``atlas.vm_taxons_plus_observes`` materialized
        views at the end.
    :param simulate: unless this is exactly ``False``, no SQL is executed; the
        INSERT statements that would be run are printed instead.
    """
    # DbMedia Query
    cur = dbconnexion.cursor()

    query = (
        "SELECT ?item ?itemLabel ?nomSc ?image ?identifiant_TAXREF "
        "WHERE { "
        "?item wdt:P225 ?nomSc. "
        "?item wdt:%s ?image. "
        "?item wdt:P3186 '%s' "
        'SERVICE wikibase:label { bd:serviceParam wikibase:language "fr" } '
        "} LIMIT 200"
    )

    # An explicit agent is passed to work around HTTP 403 errors:
    # https://www.mediawiki.org/wiki/Topic:V1zau9rqd4ritpug
    sparql = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent=(
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/50.0.2661.102 Safari/537.36"
        ),
    )
    sparql.setReturnFormat(JSON)

    # The placeholders are bound by the database driver, never by Python string
    # formatting: titles, authors and licences come from a remote service and
    # routinely contain apostrophes, which would otherwise break (or inject
    # into) the statement.
    sqlI = (
        "INSERT INTO taxonomie.t_medias "
        "(cd_ref, titre, url,is_public, id_type, auteur, source, licence) "
        "VALUES (%s, %s, %s, true, %s, %s, 'Wikimedia Commons', %s) "
    )

    for cd_ref in cd_refs:
        try:
            print("Taxon %s" % cd_ref[0])
            sparql.setQuery(query % (WD_MEDIA_PROP, cd_ref[0]))
            results = sparql.query().convert()

            for result in results["results"]["bindings"]:
                image_url = result["image"]["value"]
                if not image_url:
                    continue

                print(" -- INSERT MEDIAS")
                print(" ", image_url)

                # Fetch the file's metadata from Commons; the file name is the
                # part of the media URL that follows "Special:FilePath/".
                url = (
                    "https://tools.wmflabs.org/magnus-toolserver/commonsapi.php?image=%s"
                    % image_url.split("Special:FilePath/", 1)[1]
                )
                r = requests.get(url, timeout=_COMMONS_API_TIMEOUT)
                a = xmltodict.parse(r.content)

                try:
                    aut = _extract_author(a)

                    licence = ""
                    if "licenses" in a["response"] and "license" in a["response"]["licenses"]:
                        licence = getLicence(a["response"]["licenses"]["license"])

                    params = (
                        cd_ref[0],
                        a["response"]["file"]["name"],
                        image_url,
                        TAXHUB_MEDIA_ID_TYPE,
                        aut,
                        licence,
                    )
                    if simulate is False:
                        cur.execute(sqlI, params)
                        dbconnexion.commit()
                    else:
                        # mogrify renders the statement exactly as it would be
                        # sent to the server, without executing it.
                        print(cur.mogrify(sqlI, params).decode("utf-8", errors="replace"))
                except Exception as e:
                    print(" ERREOR")
                    print(e)
                    # Leave the connection usable for the next media.
                    dbconnexion.rollback()
        except Exception as e:
            # Best effort: a failure on one taxon must not stop the import.
            print(e)

    if simulate is False:
        # For every cd_ref that has no media with id_type = 1, set id_type = 1
        # on its media with the highest id_media.
        # NOTE(review): presumably id_type 1 is the "main picture" type — confirm.
        cur.execute(
            " UPDATE taxonomie.t_medias SET id_type = 1 "
            "WHERE id_media IN ( "
            "SELECT max(id_media) "
            "FROM taxonomie.t_medias t "
            "LEFT OUTER JOIN (SELECT cd_ref FROM taxonomie.t_medias WHERE id_type = 1) e "
            "ON t.cd_ref = e.cd_ref "
            "WHERE e.cd_ref IS NULL "
            "GROUP BY t.cd_ref "
            "); "
        )
        if refreshAtlas:
            cur.execute("REFRESH MATERIALIZED VIEW atlas.vm_medias;")
            cur.execute("REFRESH MATERIALIZED VIEW atlas.vm_taxons_plus_observes;")
    dbconnexion.commit()