diff Apache_Solr_Document.php @ 0:a2b4f67e73dc default tip

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Mon, 08 Jun 2015 10:21:54 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Apache_Solr_Document.php	Mon Jun 08 10:21:54 2015 +0200
@@ -0,0 +1,410 @@
+<?php
+/**
+ * Copyright (c) 2007-2009, Conduit Internet Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  - Neither the name of Conduit Internet Technologies, Inc. nor the names of
+ *    its contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @copyright Copyright 2007-2009 Conduit Internet Technologies, Inc. (http://conduit-it.com)
+ * @license New BSD (http://solr-php-client.googlecode.com/svn/trunk/COPYING)
+ * @version $Id: Document.php 15 2009-08-04 17:53:08Z donovan.jimenez $
+ *
+ * @package Apache
+ * @subpackage Solr
+ * @author Donovan Jimenez <djimenez@conduit-it.com>
+ */
+
+/**
+ * Additional code Copyright (c) 2011 by Peter Wolanin, and 
+ * additional contributors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program as the file LICENSE.txt; if not, please see
+ * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ */ 
+
+/**
+ * Holds Key / Value pairs that represent a Solr Document along with any associated boost
+ * values. Field values can be accessed by direct dereferencing such as:
+ *
+ * @code
+ *   $document->title = 'Something';
+ *   echo $document->title;
+ * @endcode
+ * Additionally, the field values can be iterated with foreach:
+ *
+ * @code
+ *   foreach ($document as $fieldName => $fieldValue) {
+ *   ...
+ *   }
+ * @endcode
+ */
+class ApacheSolrDocument implements IteratorAggregate {
+
+  /**
+   * Document boost value; FALSE means the default boost is used
+   *
+   * @var float|false
+   */
+  protected $_documentBoost = FALSE;
+
+  /**
+   * Document field values, indexed by name
+   *
+   * @var array
+   */
+  protected $_fields = array();
+
+  /**
+   * Document field boost values (FALSE = default boost), indexed by name
+   *
+   * @var array array of floats (or FALSE)
+   */
+  protected $_fieldBoosts = array();
+
+  /**
+   * Clear all boosts and fields from this document
+   */
+  public function clear() {
+    $this->_documentBoost = FALSE;
+
+    $this->_fields = array();
+    $this->_fieldBoosts = array();
+  }
+
+  /**
+   * Get current document boost
+   *
+   * @return float|false
+   *   will be false for default, or else a float
+   */
+  public function getBoost() {
+    return $this->_documentBoost;
+  }
+
+  /**
+   * Set document boost factor
+   *
+   * @param mixed $boost
+   *   Use false for default boost, else cast to float that should be > 0 or will be treated as false
+   */
+  public function setBoost($boost) {
+    $boost = (float) $boost;
+    // Anything that does not cast to a positive float means "default boost".
+    $this->_documentBoost = ($boost > 0.0) ? $boost : FALSE;
+  }
+
+  /**
+   * Add a value to a multi-valued field
+   *
+   * NOTE: the solr XML format allows you to specify boosts
+   * PER value even though the underlying Lucene implementation
+   * only allows a boost per field. To remedy this, the final
+   * field boost value will be the product of all specified boosts
+   * on field values - this is similar to SolrJ's functionality.
+   *
+   * @code
+   *   $doc = new ApacheSolrDocument();
+   *   $doc->addField('foo', 'bar', 2.0);
+   *   $doc->addField('foo', 'baz', 3.0);
+   *   // resultant field boost will be 6!
+   *   echo $doc->getFieldBoost('foo');
+   * @endcode
+   *
+   * @param string $key
+   * @param mixed $value
+   * @param mixed $boost
+   *   Use false for default boost, else cast to float that should be > 0 or will be treated as false
+   */
+  public function addField($key, $value, $boost = FALSE) {
+    if (!isset($this->_fields[$key])) {
+      // create holding array if this is the first value
+      $this->_fields[$key] = array();
+    }
+    else if (!is_array($this->_fields[$key])) {
+      // move existing value into array if it is not already an array
+      $this->_fields[$key] = array($this->_fields[$key]);
+    }
+
+    if ($this->getFieldBoost($key) === FALSE) {
+      // boost not already set, set it now
+      $this->setFieldBoost($key, $boost);
+    }
+    else if ((float) $boost > 0.0) {
+      // multiply passed boost with current field boost - similar to SolrJ implementation
+      $this->_fieldBoosts[$key] *= (float) $boost;
+    }
+
+    // add value to array
+    $this->_fields[$key][] = $value;
+  }
+
+  /**
+   * Handle the array manipulation for a multi-valued field
+   *
+   * @param string $key
+   * @param string $value
+   * @param mixed $boost
+   *   Use false for default boost, else cast to float that should be > 0 or will be treated as false
+   *
+   * @deprecated Use addField(...) instead
+   */
+  public function setMultiValue($key, $value, $boost = FALSE) {
+    $this->addField($key, $value, $boost);
+  }
+
+  /**
+   * Get field information
+   *
+   * @param string $key
+   * @return mixed associative array of info if field exists, false otherwise
+   */
+  public function getField($key) {
+    if (isset($this->_fields[$key])) {
+      return array(
+        'name' => $key,
+        'value' => $this->_fields[$key],
+        'boost' => $this->getFieldBoost($key)
+      );
+    }
+
+    return FALSE;
+  }
+
+  /**
+   * Set a field value. Multi-valued fields should be set as arrays
+   * or instead use the addField(...) function which will automatically
+   * make sure the field is an array.
+   *
+   * @param string $key
+   * @param mixed $value
+   * @param mixed $boost
+   *   Use false for default boost, else cast to float that should be > 0 or will be treated as false
+   */
+  public function setField($key, $value, $boost = FALSE) {
+    $this->_fields[$key] = $value;
+    $this->setFieldBoost($key, $boost);
+  }
+
+  /**
+   * Get the currently set field boost for a document field
+   *
+   * @param string $key
+   * @return float|false
+   *   currently set field boost, false if one is not set
+   */
+  public function getFieldBoost($key) {
+    return isset($this->_fieldBoosts[$key]) ? $this->_fieldBoosts[$key] : FALSE;
+  }
+
+  /**
+   * Set the field boost for a document field
+   *
+   * @param string $key
+   *   field name for the boost
+   * @param mixed $boost
+   *   Use false for default boost, else cast to float that should be > 0 or will be treated as false
+   */
+  public function setFieldBoost($key, $boost) {
+    $boost = (float) $boost;
+    // Anything that does not cast to a positive float means "default boost".
+    $this->_fieldBoosts[$key] = ($boost > 0.0) ? $boost : FALSE;
+  }
+
+  /**
+   * Return current field boosts, indexed by field name
+   *
+   * @return array
+   */
+  public function getFieldBoosts() {
+    return $this->_fieldBoosts;
+  }
+
+  /**
+   * Get the names of all fields in this document
+   *
+   * @return array
+   */
+  public function getFieldNames() {
+    return array_keys($this->_fields);
+  }
+
+  /**
+   * Get the values of all fields in this document
+   *
+   * @return array
+   */
+  public function getFieldValues() {
+    return array_values($this->_fields);
+  }
+
+  /**
+   * IteratorAggregate implementation function. Allows usage:
+   *
+   * @code
+   *   foreach ($document as $key => $value) {
+   *     ...
+   *   }
+   * @endcode
+   *
+   * @return ArrayIterator
+   */
+  #[\ReturnTypeWillChange]
+  public function getIterator() {
+    $arrayObject = new ArrayObject($this->_fields);
+
+    return $arrayObject->getIterator();
+  }
+
+  /**
+   * Magic get for field values
+   *
+   * @param string $key
+   * @return mixed
+   *   the field value, or NULL when the field is not set
+   */
+  public function __get($key) {
+    // isset() guard: an unknown field yields NULL instead of an undefined-index notice.
+    return isset($this->_fields[$key]) ? $this->_fields[$key] : NULL;
+  }
+
+  /**
+   * Magic set for field values. Multi-valued fields should be set as arrays
+   * or instead use the addField(...) function which will automatically
+   * make sure the field is an array.
+   *
+   * @param string $key
+   * @param mixed $value
+   */
+  public function __set($key, $value) {
+    $this->setField($key, $value);
+  }
+
+  /**
+   * Magic isset for fields values.  Do not call directly. Allows usage:
+   *
+   * @code
+   *   isset($document->some_field);
+   * @endcode
+   *
+   * @param string $key
+   * @return boolean
+   *   Whether the given key is set in the document
+   */
+  public function __isset($key) {
+    return isset($this->_fields[$key]);
+  }
+
+  /**
+   * Magic unset for field values. Do not call directly. Allows usage:
+   *
+   * @code
+   *   unset($document->some_field);
+   * @endcode
+   *
+   * @param string $key
+   */
+  public function __unset($key) {
+    unset($this->_fields[$key]);
+    unset($this->_fieldBoosts[$key]);
+  }
+
+  /**
+   * Create an XML fragment from a ApacheSolrDocument instance appropriate for use inside a Solr add call
+   *
+   * @param ApacheSolrDocument $document
+   *
+   * @return string
+   *   an xml formatted string from the given document
+   */
+  public static function documentToXml(ApacheSolrDocument $document) {
+    $xml = '<doc';
+
+    if ($document->getBoost() !== FALSE) {
+      $xml .= ' boost="' . $document->getBoost() . '"';
+    }
+
+    $xml .= '>';
+
+    foreach ($document as $key => $value) {
+      // Fetch the boost with the raw field name: boosts are stored under the
+      // unescaped key, so escaping first would miss names with XML specials.
+      $fieldBoost = $document->getFieldBoost($key);
+      $key = htmlspecialchars($key, ENT_QUOTES, 'UTF-8');
+
+      if (is_array($value)) {
+        foreach ($value as $multivalue) {
+          $xml .= '<field name="' . $key . '"';
+
+          if ($fieldBoost !== FALSE) {
+            $xml .= ' boost="' . $fieldBoost . '"';
+
+            // Only set the boost for the first field in the set
+            $fieldBoost = FALSE;
+          }
+
+          $xml .= '>' . htmlspecialchars($multivalue, ENT_NOQUOTES, 'UTF-8') . '</field>';
+        }
+      }
+      else {
+        $xml .= '<field name="' . $key . '"';
+
+        if ($fieldBoost !== FALSE) {
+          $xml .= ' boost="' . $fieldBoost . '"';
+        }
+
+        $xml .= '>' . htmlspecialchars($value, ENT_NOQUOTES, 'UTF-8') . '</field>';
+      }
+    }
+
+    $xml .= '</doc>';
+
+    // Remove any control characters to avoid Solr XML parser exception
+    return self::stripCtrlChars($xml);
+  }
+
+  /**
+   * Replace control (non-printable) characters from string that are invalid to Solr's XML parser with a space.
+   *
+   * @param string $string
+   * @return string
+   */
+  public static function stripCtrlChars($string) {
+    // See:  http://w3.org/International/questions/qa-forms-utf-8.html
+    // Printable utf-8 does not include any of these chars below x7F
+    return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $string);
+  }
+}
\ No newline at end of file