%load_ext rpy2.ipython
from paws.YuviPanda.replicahelper import sql
jupyter-another-article-36267168
query = """
SELECT article.page_id AS page_id,
       article.page_title AS title,
       category.project AS project
FROM (
  SELECT IF(LENGTH(IFNULL(imp.project, cls.project)) > LENGTH(IFNULL(cls.project, imp.project)),
            IFNULL(cls.project, imp.project),
            IFNULL(imp.project, cls.project)
          ) AS project,
          back.page_id AS talk_page
  FROM
      (SELECT cl_from as page_id,
              cl_to as full_text
       FROM categorylinks AS cl
       WHERE cl.cl_to LIKE "%importance\_%\_articles%" OR cl.cl_to LIKE "%Class\_%\_articles%") as back
  LEFT JOIN
      (SELECT cl_from as page_id,
              cl_to as full_text,
              SUBSTRING(cl_to,
                  LOCATE('importance_', cl_to) + 11,
                  LOCATE('_articles', cl_to) - (LOCATE('importance_', cl_to) + 11)) AS project
      FROM categorylinks AS cl
      WHERE cl.cl_to LIKE "%importance\_%\_articles%") as imp
  ON
      back.page_id = imp.page_id AND back.full_text = imp.full_text
  LEFT JOIN
  (SELECT cl_from as page_id,
          cl_to as full_text,
          SUBSTRING(cl_to,
          LOCATE('Class_', cl_to) + 6,
          LOCATE('_articles', cl_to) - (LOCATE('Class_', cl_to) + 6)) AS project
  FROM categorylinks AS cl WHERE cl.cl_to LIKE "%Class\_%\_articles%") as cls
  ON
      back.page_id = cls.page_id AND back.full_text = cls.full_text) as category
JOIN page AS tp ON
  tp.page_id = category.talk_page AND tp.page_namespace = 1
JOIN page AS article ON
  article.page_title = tp.page_title AND
  article.page_namespace = 0;"""
headers = ['page_id', 'title', 'project']
query = """
SELECT DISTINCT IF(LENGTH(IFNULL(imp.project, cls.project)) > LENGTH(IFNULL(cls.project, imp.project)),
                   IFNULL(cls.project, imp.project),
                   IFNULL(imp.project, cls.project)
               ) AS project
FROM
    (SELECT cl_from as page_id,
            cl_to as full_text
     FROM categorylinks AS cl
     WHERE cl.cl_to LIKE "%importance\_%\_articles%" OR cl.cl_to LIKE "%Class\_%\_articles%") as back
LEFT JOIN
    (SELECT cl_from as page_id,
            cl_to as full_text,
            SUBSTRING(cl_to,
                LOCATE('importance_', cl_to) + 11,
                LOCATE('_articles', cl_to) - (LOCATE('importance_', cl_to) + 11)) AS project
    FROM categorylinks AS cl
    WHERE cl.cl_to LIKE "%importance\_%\_articles%") as imp
ON
    back.page_id = imp.page_id AND back.full_text = imp.full_text
LEFT JOIN
(SELECT cl_from as page_id,
        cl_to as full_text,
        SUBSTRING(cl_to,
        LOCATE('Class_', cl_to) + 6,
        LOCATE('_articles', cl_to) - (LOCATE('Class_', cl_to) + 6)) AS project
FROM categorylinks AS cl WHERE cl.cl_to LIKE "%Class\_%\_articles%") as cls
ON
    back.page_id = cls.page_id AND back.full_text = cls.full_text;
    """
headers = ['project']
def undict(headers, data):
    for item in data:
        yield [item[h] for h in headers]

import csv
with open("page_project.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(undict(headers, sql(query)))