Extract Ernest Hemingway Quotes from Goodreads

Here's a fast and simple process to extract Ernest Hemingway quotes from Goodreads. The process is not finished yet: I still need to loop over each quote and add 1 day to the %{now} macro. The goal is then to write them in Markdown with %{now} + 1 day and auto-schedule them on my other website (thomasott.io).

Right now the Goodreads.com web structure is easy to extract but I suspect they'll make it harder one day.

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.1.000">
      <!--
        RapidMiner process: fetch the Goodreads quotes page for "A Moveable Feast",
        cut it into one segment per quote block, and turn each segment into a
        small Markdown document stamped with the current date.

        Two macros are used:
          now   : today's date as yyyy-MM-dd, set once by "Extract Macro".
          Quote : the quote text of the current segment, set by "Extract Quote".
      -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="8.1.000" expanded="true" name="Process">
        <process expanded="true">
          <!-- Step 1: build an example set with a single attribute get_date = date_now(). -->
          <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
            <list key="attribute_values">
              <parameter key="get_date" value="date_now()"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Step 2: render get_date as a nominal value using the pattern yyyy-MM-dd. -->
          <operator activated="true" class="date_to_nominal" compatibility="8.1.000" expanded="true" height="82" name="Date to Nominal" width="90" x="179" y="34">
            <parameter key="attribute_name" value="get_date"/>
            <parameter key="date_format" value="yyyy-MM-dd"/>
          </operator>
          <!-- Step 3: store the value of get_date from example 1 in the macro "now". -->
          <operator activated="true" class="extract_macro" compatibility="8.1.000" expanded="true" height="68" name="Extract Macro" width="90" x="313" y="34">
            <parameter key="macro" value="now"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="get_date"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Step 4: download the quotes page. The URL is hard-coded to one book. -->
          <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="447" y="34">
            <parameter key="url" value="https://www.goodreads.com/work/quotes/2459084-a-moveable-feast"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
            <description align="center" color="transparent" colored="false" width="126">Read Moveable Feast Quotes</description>
          </operator>
          <!--
            Step 5: cut the page into segments and run the nested process on each one.

            Literal angle brackets inside the attribute values below are written as
            entities, because a raw "less than" sign is not allowed in an XML attribute
            value. The parsed values are unchanged.

            The key "string_machting_queries" is spelled exactly as the original file
            spells it; it is looked up by name, so it must not be corrected here.

            NOTE(review): presumably the "." in the string matching value separates the
            start marker from the end marker; confirm against the Cut Document docs.
            NOTE(review): no query_type is set on this operator, so the xpath_queries
            entry looks like an unused leftover; confirm before removing it.
          -->
          <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="581" y="34">
            <list key="string_machting_queries">
              <parameter key="MoveableFeastQuotes" value="&lt;div class=&quot;quoteText&quot;&gt;.&lt;/div&gt;"/>
            </list>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="&lt;div class =&quot;quoteText&quot;&gt;" value="&lt;/div&gt;"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
            <process expanded="true">
              <!-- 5a: reduce the HTML segment to its text content. -->
              <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34"/>
              <!--
                5b: pull the quoted text out of the segment. query_type is
                "Regular Expression", so the QuoteText pattern in
                regular_expression_queries is the one selected.
                NOTE(review): the string matching entry with the same key is presumably
                ignored while query_type is "Regular Expression"; confirm.
              -->
              <operator activated="true" class="text:extract_information" compatibility="8.1.000" expanded="true" height="68" name="Extract Information" width="90" x="179" y="34">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries">
                  <parameter key="QuoteText" value="&quot;.&quot;"/>
                </list>
                <list key="regular_expression_queries">
                  <parameter key="QuoteText" value="\&quot;.*\&quot;"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
              </operator>
              <!-- 5c: convert the document to an example set; text attribute is ExtractedText. -->
              <operator activated="true" class="text:documents_to_data" compatibility="8.1.000" expanded="true" height="82" name="Documents to Data" width="90" x="313" y="34">
                <parameter key="text_attribute" value="ExtractedText"/>
              </operator>
              <!-- 5d: store the value of ExtractedText from example 1 in the macro "Quote". -->
              <operator activated="true" class="extract_macro" compatibility="8.1.000" expanded="true" height="68" name="Extract Quote" width="90" x="447" y="34">
                <parameter key="macro" value="Quote"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="ExtractedText"/>
                <parameter key="example_index" value="1"/>
                <list key="additional_macros"/>
              </operator>
              <!--
                5e: build the Markdown text from the "now" and "Quote" macros. The
                line breaks in the template are written as character references.
                NOTE(review): this operator has no incoming connection from
                "Extract Quote"; it presumably relies on operator order for the Quote
                macro to be set first. Confirm the execution order in the designer.
              -->
              <operator activated="true" class="text:create_document" compatibility="8.1.000" expanded="true" height="68" name="Create Document" width="90" x="648" y="34">
                <parameter key="text" value="Title: Hemingway Quote for %{now}&#10;Date: %{now}&#10;&#10;#Hemingway Quote for %{now}&#10;&#10;%{Quote}"/>
              </operator>
              <!-- Wiring of the nested process: segment in, created document out. -->
              <connect from_port="segment" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_op="Extract Information" to_port="document"/>
              <connect from_op="Extract Information" from_port="document" to_op="Documents to Data" to_port="documents 1"/>
              <connect from_op="Documents to Data" from_port="example set" to_op="Extract Quote" to_port="example set"/>
              <connect from_op="Create Document" from_port="output" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!--
            Top-level wiring. The date chain (steps 1 to 3) and the web chain
            (steps 4 and 5) are not connected to each other; the only link between
            them is the "now" macro. The documents from "Cut Document" are the
            process result.
          -->
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Date to Nominal" to_port="example set input"/>
          <connect from_op="Date to Nominal" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Get Page" from_port="output" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
Show Comments