#AI · RapidMiner Text Mining Tutorials Web Mining · 2018-02-20 · Thomas Ott

~2 min read

Here's a fast and simple process to extract Ernest Hemingway quotes from Goodreads. The process is not done; I still need to loop over each quote and add 1 day to the %{now} macro. The goal is then to write the quotes in Markdown, each dated %{now} + 1 day, and auto-schedule them on my other website (thomasott.io).

Right now the `Goodreads.com <http://goodreads.com>`__ web structure is easy to extract, but I suspect they'll make it harder one day.

<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- RapidMiner process: fetch the Goodreads quotes page for "A Moveable Feast",
       cut it into quoteText segments, and build a dated text document from each
       segment using the %{now} and %{Quote} macros. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Step 1: create an example set with one attribute, get_date, set to date_now(). -->
      <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
        <list key="attribute_values">
          <parameter key="get_date" value="date_now()"/>
        </list>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Step 2: turn get_date into a nominal value formatted as yyyy-MM-dd. -->
      <operator activated="true" class="date_to_nominal" compatibility="8.1.000" expanded="true" height="82" name="Date to Nominal" width="90" x="179" y="34">
        <parameter key="attribute_name" value="get_date"/>
        <parameter key="date_format" value="yyyy-MM-dd"/>
      </operator>
      <!-- Step 3: store the get_date value at example index 1 in the macro "now". -->
      <operator activated="true" class="extract_macro" compatibility="8.1.000" expanded="true" height="68" name="Extract Macro" width="90" x="313" y="34">
        <parameter key="macro" value="now"/>
        <parameter key="macro_type" value="data_value"/>
        <parameter key="attribute_name" value="get_date"/>
        <parameter key="example_index" value="1"/>
        <list key="additional_macros"/>
      </operator>
      <!-- Step 4: download the quotes page. This branch is not wired to steps 1 to 3;
           it only shares the "now" macro with them. -->
      <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="447" y="34">
        <parameter key="url" value="https://www.goodreads.com/work/quotes/2459084-a-moveable-feast"/>
        <list key="query_parameters"/>
        <list key="request_properties"/>
        <description align="center" color="transparent" colored="false" width="126">Read Moveable Feast Quotes</description>
      </operator>
      <!-- Step 5: cut the page into segments and run the nested process on each one.
           The markup delimiters in the query lists below are written with &lt; and &gt;
           because a literal less-than sign is not allowed inside an XML attribute value.
           NOTE(review): no query_type is set here, so the operator default applies;
           presumably that is string matching with the MoveableFeastQuotes query. Confirm.
           NOTE(review): the xpath_queries entry holds HTML tag fragments rather than an
           XPath expression and looks like a leftover. Confirm before relying on it. -->
      <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="581" y="34">
        <list key="string_machting_queries">
          <parameter key="MoveableFeastQuotes" value="&lt;div class=&quot;quoteText&quot;&gt;.&lt;/div&gt;"/>
        </list>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries">
          <parameter key="&lt;div class =&quot;quoteText&quot;&gt;" value="&lt;/div&gt;"/>
        </list>
        <list key="namespaces"/>
        <list key="index_queries"/>
        <list key="jsonpath_queries"/>
        <process expanded="true">
          <!-- 5a: reduce the HTML segment to its text content. -->
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34"/>
          <!-- 5b: query_type is Regular Expression; the QuoteText pattern is a greedy
               match of text enclosed in straight double quotes.
               NOTE(review): the string_machting_queries entry is presumably unused
               with this query_type. Confirm. -->
          <operator activated="true" class="text:extract_information" compatibility="8.1.000" expanded="true" height="68" name="Extract Information" width="90" x="179" y="34">
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries">
              <parameter key="QuoteText" value="&quot;.&quot;"/>
            </list>
            <list key="regular_expression_queries">
              <parameter key="QuoteText" value="\&quot;.*\&quot;"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <!-- 5c: convert the document to an example set with the text in ExtractedText. -->
          <operator activated="true" class="text:documents_to_data" compatibility="8.1.000" expanded="true" height="82" name="Documents to Data" width="90" x="313" y="34">
            <parameter key="text_attribute" value="ExtractedText"/>
          </operator>
          <!-- 5d: store the ExtractedText value at example index 1 in the macro "Quote". -->
          <operator activated="true" class="extract_macro" compatibility="8.1.000" expanded="true" height="68" name="Extract Quote" width="90" x="447" y="34">
            <parameter key="macro" value="Quote"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="ExtractedText"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!-- 5e: build the output text (Title and Date lines, a heading, then the quote)
               purely from the %{now} and %{Quote} macros; &#10; is a line feed.
               NOTE(review): this operator has no incoming connection and Extract Quote
               has no outgoing one, so %{Quote} is only set if Extract Quote runs first.
               Confirm the execution order. -->
          <operator activated="true" class="text:create_document" compatibility="8.1.000" expanded="true" height="68" name="Create Document" width="90" x="648" y="34">
            <parameter key="text" value="Title: Hemingway Quote for %{now}&#10;Date: %{now}&#10;&#10;#Hemingway Quote for %{now}&#10;&#10;%{Quote}"/>
          </operator>
          <connect from_port="segment" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Extract Quote" to_port="example set"/>
          <connect from_op="Create Document" from_port="output" to_port="document 1"/>
          <portSpacing port="source_segment" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Generate Data by User Specification" from_port="output" to_op="Date to Nominal" to_port="example set input"/>
      <connect from_op="Date to Nominal" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
      <connect from_op="Get Page" from_port="output" to_op="Cut Document" to_port="document"/>
      <connect from_op="Cut Document" from_port="documents" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>