Add new entity lexers

This commit is contained in:
Daniel Supernault 2018-06-08 21:31:42 -06:00
parent 33ff1f7829
commit 7bb1f10d19
7 changed files with 2698 additions and 0 deletions

771
app/Util/Lexer/Autolink.php Executable file
View file

@ -0,0 +1,771 @@
<?php
/**
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
namespace App\Util\Lexer;
use App\Util\Lexer\Regex;
use App\Util\Lexer\Extractor;
use App\Util\Lexer\StringUtils;
/**
* Twitter Autolink Class
*
* Parses tweets and generates HTML anchor tags around URLs, usernames,
* username/list pairs and hashtags.
*
* Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
* is based on code by {@link http://github.com/mzsanford Matt Sanford} and
* heavily modified by {@link http://github.com/ngnpope Nick Pope}.
*
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
class Autolink extends Regex
{
/**
* CSS class for auto-linked URLs.
*
* @var string
*/
protected $class_url = '';
/**
* CSS class for auto-linked username URLs.
*
* @var string
*/
protected $class_user = 'u-url mention';
/**
* CSS class for auto-linked list URLs.
*
* @var string
*/
protected $class_list = 'u-url list-slug';
/**
* CSS class for auto-linked hashtag URLs.
*
* @var string
*/
protected $class_hash = 'u-url hashtag';
/**
* CSS class for auto-linked cashtag URLs.
*
* @var string
*/
protected $class_cash = 'u-url cashtag';
/**
* URL base for username links (the username without the @ will be appended).
*
* @var string
*/
protected $url_base_user = null;
/**
* URL base for list links (the username/list without the @ will be appended).
*
* @var string
*/
protected $url_base_list = null;
/**
* URL base for hashtag links (the hashtag without the # will be appended).
*
* @var string
*/
protected $url_base_hash = null;
/**
* URL base for cashtag links (the hashtag without the $ will be appended).
*
* @var string
*/
protected $url_base_cash = null;
/**
* Whether to include the value 'nofollow' in the 'rel' attribute.
*
* @var bool
*/
protected $nofollow = true;
/**
* Whether to include the value 'noopener' in the 'rel' attribute.
*
* @var bool
*/
protected $noopener = true;
/**
* Whether to include the value 'external' in the 'rel' attribute.
*
* Often this is used to be matched on in JavaScript for dynamically adding
* the 'target' attribute which is deprecated in HTML 4.01. In HTML 5 it has
* been undeprecated and thus the 'target' attribute can be used. If this is
* set to false then the 'target' attribute will be output.
*
* @var bool
*/
protected $external = true;
/**
* The scope to open the link in.
*
* Support for the 'target' attribute was deprecated in HTML 4.01 but has
* since been reinstated in HTML 5. To output the 'target' attribute you
* must disable the adding of the string 'external' to the 'rel' attribute.
*
* @var string
*/
protected $target = '_blank';
/**
* attribute for invisible span tag
*
* @var string
*/
protected $invisibleTagAttrs = "style='position:absolute;left:-9999px;'";
/**
*
* @var Extractor
*/
protected $extractor = null;
/**
 * Provides fluent method chaining.
 *
 * The constructor takes ($tweet, $escape, $full_encode). Escaping is kept at
 * the constructor's default (enabled) and $full_encode is forwarded to the
 * matching third parameter.
 *
 * @param string $tweet       The tweet to be converted.
 * @param bool   $full_encode Whether to encode all special characters.
 *
 * @see __construct()
 *
 * @return Autolink
 */
public static function create($tweet = null, $full_encode = false)
{
    // Pass $escape explicitly; otherwise $full_encode lands in the $escape slot.
    return new static($tweet, true, $full_encode);
}
/**
 * Stores the tweet (optionally HTML-escaped) and prepares link settings.
 *
 * When $escape is true and the tweet is non-empty, the text is escaped with
 * htmlentities() (if $full_encode) or htmlspecialchars() before it is handed
 * to the parent constructor; both run with ENT_QUOTES, UTF-8 and
 * double-encoding disabled. The URL bases are derived from config('app.url').
 *
 * @see htmlspecialchars()
 *
 * @param string $tweet       The tweet to be converted.
 * @param bool   $escape      Whether to escape the tweet (default: true).
 * @param bool   $full_encode Whether to encode all special characters.
 */
public function __construct($tweet = null, $escape = true, $full_encode = false)
{
    $text = $tweet;
    if ($escape && !empty($tweet)) {
        $text = $full_encode
            ? htmlentities($tweet, ENT_QUOTES, 'UTF-8', false)
            : htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false);
    }
    parent::__construct($text);

    $this->extractor = Extractor::create();

    $appUrl = config('app.url');
    $this->url_base_user = $appUrl . '/';
    $this->url_base_list = $appUrl . '/';
    $this->url_base_hash = $appUrl . "/discover/tags/";
    $this->url_base_cash = $appUrl . '/search?q=%24';
}
/**
 * Returns the CSS class used for auto-linked URLs.
 *
 * @return string CSS class for URL links (the class_url property).
 */
public function getURLClass()
{
return $this->class_url;
}
/**
 * Sets the CSS class used for auto-linked URLs.
 *
 * @param string $v CSS class for URL links; surrounding whitespace is trimmed.
 *
 * @return Autolink Fluid method chaining.
 */
public function setURLClass($v)
{
    $trimmed = trim($v);
    $this->class_url = $trimmed;

    return $this;
}
/**
 * Returns the CSS class used for auto-linked username URLs.
 *
 * @return string CSS class for username links (the class_user property).
 */
public function getUsernameClass()
{
return $this->class_user;
}
/**
 * Sets the CSS class used for auto-linked username URLs.
 *
 * @param string $v CSS class for username links; surrounding whitespace is trimmed.
 *
 * @return Autolink Fluid method chaining.
 */
public function setUsernameClass($v)
{
    $trimmed = trim($v);
    $this->class_user = $trimmed;

    return $this;
}
/**
 * Returns the CSS class used for auto-linked username/list URLs.
 *
 * @return string CSS class for username/list links (the class_list property).
 */
public function getListClass()
{
return $this->class_list;
}
/**
 * Sets the CSS class used for auto-linked username/list URLs.
 *
 * @param string $v CSS class for username/list links; surrounding whitespace is trimmed.
 *
 * @return Autolink Fluid method chaining.
 */
public function setListClass($v)
{
    $trimmed = trim($v);
    $this->class_list = $trimmed;

    return $this;
}
/**
 * Returns the CSS class used for auto-linked hashtag URLs.
 *
 * @return string CSS class for hashtag links (the class_hash property).
 */
public function getHashtagClass()
{
return $this->class_hash;
}
/**
 * Sets the CSS class used for auto-linked hashtag URLs.
 *
 * @param string $v CSS class for hashtag links; surrounding whitespace is trimmed.
 *
 * @return Autolink Fluid method chaining.
 */
public function setHashtagClass($v)
{
    $trimmed = trim($v);
    $this->class_hash = $trimmed;

    return $this;
}
/**
 * Returns the CSS class used for auto-linked cashtag URLs.
 *
 * @return string CSS class for cashtag links (the class_cash property).
 */
public function getCashtagClass()
{
return $this->class_cash;
}
/**
 * Sets the CSS class used for auto-linked cashtag URLs.
 *
 * @param string $v CSS class for cashtag links; surrounding whitespace is trimmed.
 *
 * @return Autolink Fluid method chaining.
 */
public function setCashtagClass($v)
{
    $trimmed = trim($v);
    $this->class_cash = $trimmed;

    return $this;
}
/**
 * Reports whether the value 'nofollow' is included in the 'rel' attribute.
 *
 * @return bool Whether to add 'nofollow' to the 'rel' attribute.
 */
public function getNoFollow()
{
return $this->nofollow;
}
/**
 * Sets whether the value 'nofollow' is included in the 'rel' attribute.
 *
 * @param bool $v Whether to add 'nofollow' to the 'rel' attribute. The value
 *                is stored as given (no cast); linkToText() only tests its
 *                truthiness.
 *
 * @return Autolink Fluid method chaining.
 */
public function setNoFollow($v)
{
$this->nofollow = $v;
return $this;
}
/**
 * Reports whether the value 'external' is included in the 'rel' attribute.
 *
 * Often this is matched on in JavaScript for dynamically adding the 'target'
 * attribute, which was deprecated in HTML 4.01 and reinstated in HTML 5.
 * Note that linkToText() in this class emits 'target' whenever a target is
 * set; it does not consult this flag for that decision.
 *
 * @return bool Whether to add 'external' to the 'rel' attribute.
 */
public function getExternal()
{
return $this->external;
}
/**
 * Sets whether the value 'external' is included in the 'rel' attribute.
 *
 * Often this is matched on in JavaScript for dynamically adding the 'target'
 * attribute, which was deprecated in HTML 4.01 and reinstated in HTML 5.
 * Note that linkToText() in this class emits 'target' whenever a target is
 * set; it does not consult this flag for that decision.
 *
 * @param bool $v Whether to add 'external' to the 'rel' attribute. The value
 *                is stored as given (no cast).
 *
 * @return Autolink Fluid method chaining.
 */
public function setExternal($v)
{
$this->external = $v;
return $this;
}
/**
 * Returns the scope to open links in.
 *
 * Support for the 'target' attribute was deprecated in HTML 4.01 but has
 * since been reinstated in HTML 5. linkToText() adds the attribute whenever
 * this value is truthy.
 *
 * @return string The value to add to the 'target' attribute.
 */
public function getTarget()
{
return $this->target;
}
/**
 * Sets the scope to open links in.
 *
 * Support for the 'target' attribute was deprecated in HTML 4.01 but has
 * since been reinstated in HTML 5. linkToText() adds the attribute whenever
 * the stored value is truthy.
 *
 * @param string $v The value to add to the 'target' attribute; surrounding
 *                  whitespace is trimmed.
 *
 * @return Autolink Fluid method chaining.
 */
public function setTarget($v)
{
    $trimmed = trim($v);
    $this->target = $trimmed;

    return $this;
}
/**
 * Autolink with entities.
 *
 * Walks the entities in the order given, copying the text between them
 * unchanged and replacing each entity span with the anchor produced by the
 * matching linkTo*() method. Entities are expected to be non-overlapping and
 * ordered by their start index (callers in this class pass the result of
 * Extractor::removeOverlappingEntities()).
 *
 * @param string $tweet    Text to link; defaults to the tweet given at construction.
 * @param array  $entities Entities with an 'indices' pair plus one of the keys
 *                         'url', 'hashtag', 'screen_name' or 'cashtag'.
 *                         Null is treated as an empty list.
 * @return string
 * @since 1.1.0
 */
public function autoLinkEntities($tweet = null, $entities = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }
    // The declared default is null; treat it as "nothing to link" instead of
    // handing null to foreach.
    if (is_null($entities)) {
        $entities = array();
    }
    $text = '';
    $beginIndex = 0;
    foreach ($entities as $entity) {
        // Copy the untouched text that precedes this entity. For mentions one
        // extra character (the one at indices[0]) is copied as well, because
        // linkToMentionAndList() builds its link text from the bare screen name.
        $gapLength = $entity['indices'][0] - $beginIndex;
        if (isset($entity['screen_name'])) {
            $gapLength++;
        }
        $text .= StringUtils::substr($tweet, $beginIndex, $gapLength);

        if (isset($entity['url'])) {
            $text .= $this->linkToUrl($entity);
        } elseif (isset($entity['hashtag'])) {
            $text .= $this->linkToHashtag($entity, $tweet);
        } elseif (isset($entity['screen_name'])) {
            $text .= $this->linkToMentionAndList($entity);
        } elseif (isset($entity['cashtag'])) {
            $text .= $this->linkToCashtag($entity, $tweet);
        }
        $beginIndex = $entity['indices'][1];
    }
    // Append whatever follows the last entity.
    $text .= StringUtils::substr($tweet, $beginIndex, StringUtils::strlen($tweet));
    return $text;
}
/**
 * Auto-link hashtags, URLs, usernames and lists, with JSON entities.
 *
 * @param string $tweet The tweet to be converted.
 * @param mixed  $json  The entities info: an array (or object, converted via
 *                      object2array()) whose members are lists of entities.
 *                      Members that are not arrays are ignored.
 * @return string that auto-link HTML added
 * @since 1.1.0
 */
public function autoLinkWithJson($tweet = null, $json = null)
{
    if (is_object($json)) {
        $json = $this->object2array($json);
    }
    // Concatenate every entity group into one flat list.
    $entities = array();
    if (is_array($json)) {
        foreach ($json as $group) {
            // array_merge() only accepts arrays; skip scalar/null members
            // rather than failing on them.
            if (is_array($group)) {
                $entities = array_merge($entities, $group);
            }
        }
    }
    // Map JSON entity to twitter-text entity: a non-empty 'text' is copied to 'hashtag'.
    foreach ($entities as $idx => $entity) {
        if (!empty($entity['text'])) {
            $entities[$idx]['hashtag'] = $entity['text'];
        }
    }
    $entities = $this->extractor->removeOverlappingEntities($entities);
    return $this->autoLinkEntities($tweet, $entities);
}
/**
 * Recursively converts an object (or array) into a plain array.
 *
 * The value is cast with (array); every element that is itself an object or
 * array is converted the same way.
 *
 * @param mixed $obj
 * @return array
 */
protected function object2array($obj)
{
    $converted = (array) $obj;
    foreach ($converted as $name => $value) {
        if (!is_object($value) && !is_array($value)) {
            continue;
        }
        $converted[$name] = $this->object2array($value);
    }
    return $converted;
}
/**
 * Auto-link hashtags, URLs, usernames, lists and cashtags.
 *
 * Switches the shared extractor to "URLs with protocol only" (the setting
 * stays in effect afterwards), extracts all entities and links them.
 *
 * @param string $tweet The tweet to be converted; defaults to the stored tweet.
 * @return string that auto-link HTML added
 * @since 1.1.0
 */
public function autoLink($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;

    $extractor = $this->extractor->extractURLWithoutProtocol(false);
    $found = $extractor->extractEntitiesWithIndices($text);

    return $this->autoLinkEntities($text, $found);
}
/**
 * Auto-link the @username and @username/list references in the provided
 * text. Username links get the usernameClass CSS class, username/list links
 * get the listClass CSS class.
 *
 * @param string $tweet The tweet to be converted; defaults to the stored tweet.
 * @return string that auto-link HTML added
 * @since 1.1.0
 */
public function autoLinkUsernamesAndLists($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;
    $found = $this->extractor->extractMentionsOrListsWithIndices($text);

    return $this->autoLinkEntities($text, $found);
}
/**
 * Auto-link #hashtag references in the provided text. The links get the
 * hashtagClass CSS class.
 *
 * @param string $tweet The tweet to be converted; defaults to the stored tweet.
 * @return string that auto-link HTML added
 * @since 1.1.0
 */
public function autoLinkHashtags($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;
    $found = $this->extractor->extractHashtagsWithIndices($text);

    return $this->autoLinkEntities($text, $found);
}
/**
 * Auto-link URLs in the text provided.
 *
 * Only URLs with a protocol are linked: the shared extractor is switched to
 * that mode before extraction and keeps the setting afterwards.
 *
 * @param string $tweet The tweet to be converted; defaults to the stored tweet.
 * @return string that auto-link HTML added
 * @since 1.1.0
 */
public function autoLinkURLs($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;

    $extractor = $this->extractor->extractURLWithoutProtocol(false);
    $found = $extractor->extractURLsWithIndices($text);

    return $this->autoLinkEntities($text, $found);
}
/**
 * Auto-link $cashtag references in the provided text. The links get the
 * cashtagClass CSS class.
 *
 * @param string $tweet The tweet to be converted; defaults to the stored tweet.
 * @return string that auto-link HTML added
 * @since 1.1.0
 */
public function autoLinkCashtags($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;
    $found = $this->extractor->extractCashtagsWithIndices($text);

    return $this->autoLinkEntities($text, $found);
}
/**
 * Builds the anchor for a URL entity.
 *
 * The href is always $entity['url']. The visible text is, in order of
 * preference: a copy-friendly span structure built from display_url and
 * expanded_url, the escaped display_url, or the escaped url. Every value
 * taken from the entity is HTML-escaped before it becomes link text, because
 * linkToText() inserts the text verbatim.
 *
 * @param array $entity Entity with 'url' and optionally 'display_url' and
 *                      'expanded_url'.
 * @return string
 */
public function linkToUrl($entity)
{
    $attributes = array();
    if (!empty($this->class_url)) {
        $attributes['class'] = $this->class_url;
    }
    $attributes['href'] = $entity['url'];
    $linkText = $this->escapeHTML($entity['url']);
    if (!empty($entity['display_url']) && !empty($entity['expanded_url'])) {
        // Goal: If a user copies and pastes a tweet containing a shortened link, the
        // resulting paste should contain the full original URL (expanded_url), not
        // the display URL.
        //
        // Method: emit HTML that contains expanded_url and visually hide the parts
        // that are not part of display_url (using $this->invisibleTagAttrs). Hidden
        // elements still get copied; display:none would not work because such
        // elements are not copied.
        //
        // Ellipses should be *displayed* but not copied, so they are wrapped in a
        // tco-ellipsis class; an onCopy handler is expected to hide everything with
        // that class. In case that handler does not fire, an invisible &nbsp; sits
        // inside the tco-ellipsis span so the ellipsis does not bump into the URL.
        //
        // Example: "hi http://longdomainname.com/foo" shortened to
        // "hi http://t.co/xyzabc" with display_url = "…nname.com/foo" renders as:
        //   <span class='tco-ellipsis'>…<span invisible>&nbsp;</span></span>   displayed, not copied
        //   <span invisible>http://longdomai</span>                            copied, not displayed
        //   <span class='js-display-url'>nname.com/foo</span>                  displayed and copied
        //   <span class='tco-ellipsis'><span invisible>&nbsp;</span>…</span>   displayed, not copied
        //
        // Exception: pic.socialhub.dev images, for which expandedUrl =
        // "https://socialhub.dev/#!/username/status/1234/photo/1". There display_url
        // is not a substring of expanded_url, so nothing special is rendered for the
        // elided parts (only the "https://" is elided, which is fine).
        $displayURL = $entity['display_url'];
        $expandedURL = $entity['expanded_url'];
        $displayURLSansEllipses = preg_replace('/…/u', '', $displayURL);
        $displayURLIndexInExpandedURL = mb_strpos($expandedURL, $displayURLSansEllipses);
        if ($displayURLIndexInExpandedURL !== false) {
            $beforeDisplayURL = mb_substr($expandedURL, 0, $displayURLIndexInExpandedURL);
            $afterDisplayURL = mb_substr(
                $expandedURL,
                $displayURLIndexInExpandedURL + mb_strlen($displayURLSansEllipses)
            );
            $precedingEllipsis = (preg_match('/\A…/u', $displayURL)) ? '…' : '';
            $followingEllipsis = (preg_match('/…\z/u', $displayURL)) ? '…' : '';
            $invisibleSpan = "<span {$this->invisibleTagAttrs}>";
            $linkText = "<span class='tco-ellipsis'>{$precedingEllipsis}{$invisibleSpan}&nbsp;</span></span>";
            $linkText .= "{$invisibleSpan}{$this->escapeHTML($beforeDisplayURL)}</span>";
            $linkText .= "<span class='js-display-url'>{$this->escapeHTML($displayURLSansEllipses)}</span>";
            $linkText .= "{$invisibleSpan}{$this->escapeHTML($afterDisplayURL)}</span>";
            $linkText .= "<span class='tco-ellipsis'>{$invisibleSpan}&nbsp;</span>{$followingEllipsis}</span>";
        } else {
            // display_url is entity-supplied text, not markup: escape it.
            $linkText = $this->escapeHTML($displayURL);
        }
        $attributes['title'] = $expandedURL;
    } elseif (!empty($entity['display_url'])) {
        // display_url is entity-supplied text, not markup: escape it.
        $linkText = $this->escapeHTML($entity['display_url']);
    }
    return $this->linkToText($entity, $linkText, $attributes);
}
/**
 * Builds the anchor for a hashtag entity.
 *
 * Hashtag links are rendered without a 'target' attribute. The configured
 * target is only suppressed while this one link is built and is restored
 * afterwards, so links generated later still receive it.
 *
 * @param array  $entity Entity with 'hashtag' and 'indices'.
 * @param string $tweet  Text the indices refer to; defaults to the stored tweet.
 * @return string
 * @since 1.1.0
 */
public function linkToHashtag($entity, $tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }
    $attributes = array();
    $class = array();
    // Reuse the hash character actually present in the text at the entity start.
    $hash = StringUtils::substr($tweet, $entity['indices'][0], 1);
    $linkText = $hash . $entity['hashtag'];
    $attributes['href'] = $this->url_base_hash . $entity['hashtag'] . '?src=hash';
    $attributes['title'] = '#' . $entity['hashtag'];
    if (!empty($this->class_hash)) {
        $class[] = $this->class_hash;
    }
    if (preg_match(self::$patterns['rtl_chars'], $linkText)) {
        $class[] = 'rtl';
    }
    if (!empty($class)) {
        $attributes['class'] = join(' ', $class);
    }
    // Suppress 'target' for this link only; do not leak the change into the
    // instance state used by subsequent links.
    $configuredTarget = $this->target;
    $this->target = false;
    $link = $this->linkToText($entity, $linkText, $attributes);
    $this->target = $configuredTarget;
    return $link;
}
/**
 * Builds the anchor for a mention or a username/list entity.
 *
 * With a non-empty 'list_slug' the link text is screen name plus slug and the
 * list class / list URL base are used; otherwise the screen name alone with
 * the user class / user URL base.
 *
 * @param array $entity Entity with 'screen_name' and optionally 'list_slug'.
 * @return string
 * @since 1.1.0
 */
public function linkToMentionAndList($entity)
{
    $isList = !empty($entity['list_slug']);

    $linkText = $isList
        ? $entity['screen_name'] . $entity['list_slug']
        : $entity['screen_name'];
    $class = $isList ? $this->class_list : $this->class_user;
    $urlBase = $isList ? $this->url_base_list : $this->url_base_user;

    $attributes = array();
    if (!empty($class)) {
        $attributes['class'] = $class;
    }
    $attributes['href'] = $urlBase . $linkText;

    return $this->linkToText($entity, $linkText, $attributes);
}
/**
 * Builds the anchor for a cashtag entity.
 *
 * The link text and title are the character found at the entity start
 * followed by the cashtag; the href is the cashtag URL base plus the cashtag.
 *
 * @param array  $entity Entity with 'cashtag' and 'indices'.
 * @param string $tweet  Text the indices refer to; defaults to the stored tweet.
 * @return string
 * @since 1.1.0
 */
public function linkToCashtag($entity, $tweet = null)
{
    $source = is_null($tweet) ? $this->tweet : $tweet;

    $dollar = StringUtils::substr($source, $entity['indices'][0], 1);
    $linkText = $dollar . $entity['cashtag'];

    $attributes = array(
        'href' => $this->url_base_cash . $entity['cashtag'],
        'title' => $linkText,
    );
    if (!empty($this->class_cash)) {
        $attributes['class'] = $this->class_cash;
    }

    return $this->linkToText($entity, $linkText, $attributes);
}
/**
 * Renders an <a> element.
 *
 * Adds a 'rel' attribute made of the enabled values 'external', 'nofollow'
 * and 'noopener' (in that order) and a 'target' attribute when a target is
 * set. Attribute values are HTML-escaped; $text is inserted as-is, so callers
 * must pass already-safe markup.
 *
 * @param array  $entity     The entity being linked (not read here).
 * @param string $text       Inner HTML of the link.
 * @param array  $attributes Attribute name => value pairs.
 * @return string
 * @since 1.1.0
 */
public function linkToText(array $entity, $text, $attributes = array())
{
    $relFlags = array(
        'external' => $this->external,
        'nofollow' => $this->nofollow,
        'noopener' => $this->noopener,
    );
    $rel = array();
    foreach ($relFlags as $token => $enabled) {
        if ($enabled) {
            $rel[] = $token;
        }
    }
    if (!empty($rel)) {
        $attributes['rel'] = implode(' ', $rel);
    }
    if ($this->target) {
        $attributes['target'] = $this->target;
    }

    $rendered = '';
    foreach ($attributes as $name => $value) {
        $rendered .= ' ' . $name . '="' . $this->escapeHTML($value) . '"';
    }

    return '<a' . $rendered . '>' . $text . '</a>';
}
/**
 * HTML-escapes a string.
 *
 * Uses htmlspecialchars() with ENT_QUOTES (both quote styles are converted),
 * UTF-8, and double-encoding disabled, so existing entities in $text are
 * left untouched.
 *
 * @param string $text
 * @return string
 */
protected function escapeHTML($text)
{
return htmlspecialchars($text, ENT_QUOTES, 'UTF-8', false);
}
}

548
app/Util/Lexer/Extractor.php Executable file
View file

@ -0,0 +1,548 @@
<?php
/**
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
namespace App\Util\Lexer;
use App\Util\Lexer\Regex;
use App\Util\Lexer\StringUtils;
/**
* Twitter Extractor Class
*
* Parses tweets and extracts URLs, usernames, username/list pairs and
* hashtags.
*
* Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
* is based on code by {@link http://github.com/mzsanford Matt Sanford} and
* heavily modified by {@link http://github.com/ngnpope Nick Pope}.
*
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
class Extractor extends Regex
{
/**
* @var boolean
*/
protected $extractURLWithoutProtocol = true;
/**
 * Provides fluent method chaining.
 *
 * Uses late static binding so that calling create() on a subclass returns an
 * instance of that subclass.
 *
 * @param string $tweet The tweet to be converted.
 *
 * @see __construct()
 *
 * @return Extractor
 */
public static function create($tweet = null)
{
    return new static($tweet);
}
/**
 * Reads in a tweet to be parsed and extracts elements from it.
 *
 * Simply forwards the tweet to the parent constructor.
 *
 * @param string $tweet The tweet to extract.
 */
public function __construct($tweet = null)
{
parent::__construct($tweet);
}
/**
 * Extracts all parts of a tweet and returns an associative array containing
 * the extracted elements.
 *
 * Every extractor is called with the tweet passed explicitly, so the result
 * does not depend on the order in which the array entries are evaluated.
 *
 * @param string $tweet The tweet to extract; defaults to the stored tweet.
 * @return array The elements in the tweet, keyed by 'hashtags', 'urls',
 *               'mentions', 'replyto', 'hashtags_with_indices',
 *               'urls_with_indices' and 'mentions_with_indices'.
 */
public function extract($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }
    // The deprecated extractMentionedUsernames() used here before stored the
    // tweet on the instance as a side effect; keep that effect, explicitly.
    $this->tweet = $tweet;
    return array(
        'hashtags' => $this->extractHashtags($tweet),
        'urls' => $this->extractURLs($tweet),
        'mentions' => $this->extractMentionedScreennames($tweet),
        'replyto' => $this->extractReplyScreenname($tweet),
        'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet),
        'urls_with_indices' => $this->extractURLsWithIndices($tweet),
        'mentions_with_indices' => $this->extractMentionedScreennamesWithIndices($tweet),
    );
}
/**
 * Extract URLs, #hashtags, @mentions/lists and $cashtags from a text.
 *
 * The four result lists are concatenated in that order and overlapping
 * entities are removed. Hashtags are extracted without their own URL-overlap
 * check.
 *
 * @param string $tweet The tweet to extract; defaults to the stored tweet.
 * @return array list of extracted entities
 */
public function extractEntitiesWithIndices($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;

    $urls = $this->extractURLsWithIndices($text);
    $hashtags = $this->extractHashtagsWithIndices($text, false);
    $mentions = $this->extractMentionsOrListsWithIndices($text);
    $cashtags = $this->extractCashtagsWithIndices($text);

    return $this->removeOverlappingEntities(
        array_merge($urls, $hashtags, $mentions, $cashtags)
    );
}
/**
 * Extracts all the hashtags from the tweet.
 *
 * @param string $tweet The tweet to extract.
 * @return array The 'hashtag' value of every entity found by
 *               extractHashtagsWithIndices().
 */
public function extractHashtags($tweet = null)
{
    return array_map(
        function ($entity) {
            return $entity['hashtag'];
        },
        $this->extractHashtagsWithIndices($tweet)
    );
}
/**
 * Extracts all the cashtags from the tweet.
 *
 * @param string $tweet The tweet to extract.
 * @return array The 'cashtag' value of every entity found by
 *               extractCashtagsWithIndices().
 */
public function extractCashtags($tweet = null)
{
    return array_map(
        function ($entity) {
            return $entity['cashtag'];
        },
        $this->extractCashtagsWithIndices($tweet)
    );
}
/**
 * Extracts all the URLs from the tweet.
 *
 * @param string $tweet The tweet to extract.
 * @return array The 'url' value of every entity found by
 *               extractURLsWithIndices().
 */
public function extractURLs($tweet = null)
{
    return array_map(
        function ($entity) {
            return $entity['url'];
        },
        $this->extractURLsWithIndices($tweet)
    );
}
/**
 * Extract all the usernames from the tweet.
 *
 * A mention is an occurrence of a username anywhere in a tweet. Names are
 * lower-cased and returned once each, in order of first appearance.
 *
 * @param string $tweet The tweet to extract.
 * @return array The usernames elements in the tweet.
 */
public function extractMentionedScreennames($tweet = null)
{
    $usernamesOnly = array();
    $mentionsWithIndices = $this->extractMentionsOrListsWithIndices($tweet);
    foreach ($mentionsWithIndices as $mentionWithIndex) {
        $screen_name = mb_strtolower($mentionWithIndex['screen_name']);
        // Compare against '' rather than empty(): empty('0') is true and
        // would drop a screen name consisting of the single digit 0.
        if ($screen_name === '') {
            continue;
        }
        // Strict comparison: loosely, numeric strings such as '123' and
        // '0123' compare equal and would be treated as duplicates.
        if (in_array($screen_name, $usernamesOnly, true)) {
            continue;
        }
        $usernamesOnly[] = $screen_name;
    }
    return $usernamesOnly;
}
/**
 * Extract all the usernames from the tweet.
 *
 * A mention is an occurrence of a username anywhere in a tweet. Unlike the
 * other extractors, this wrapper also stores the given tweet on the instance
 * before delegating to extractMentionedScreennames().
 *
 * @param string $tweet The tweet to extract (required; there is no default).
 * @return array The usernames elements in the tweet.
 * @deprecated since version 1.1.0
 */
public function extractMentionedUsernames($tweet)
{
$this->tweet = $tweet;
return $this->extractMentionedScreennames($tweet);
}
/**
 * Extract the username replied to from the tweet.
 *
 * Matches the 'valid_reply' pattern against the text. The match is rejected
 * when its second capture group matches the 'end_mention_match' pattern.
 *
 * @param string $tweet The tweet to extract; defaults to the stored tweet.
 * @return string|null The first capture group of the reply match, or null
 *                     when there is no valid reply.
 */
public function extractReplyScreenname($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;

    if (!preg_match(self::$patterns['valid_reply'], $text, $matches)) {
        return null;
    }
    # Check username ending in
    if (preg_match(self::$patterns['end_mention_match'], $matches[2])) {
        return null;
    }

    return $matches[1];
}
/**
 * Extract the username replied to from the stored tweet.
 *
 * Takes no parameters and delegates to extractReplyScreenname() without
 * arguments, so it always works on the tweet stored on the instance.
 *
 * @return string|null Whatever extractReplyScreenname() returns.
 * @deprecated since version 1.1.0
 */
public function extractRepliedUsernames()
{
return $this->extractReplyScreenname();
}
/**
 * Extracts all the hashtags and the indices they occur at from the tweet.
 *
 * @param string  $tweet           The tweet to extract; defaults to the stored tweet.
 * @param boolean $checkUrlOverlap if true, check if extracted hashtags overlap URLs and remove overlapping ones
 * @return array The hashtag elements in the tweet, each with 'hashtag' and
 *               'indices' (start, end).
 */
public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }
    // Cheap pre-check: nothing to do without a hash character.
    if (!preg_match('/[#]/iu', $tweet)) {
        return array();
    }
    preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    $tags = array();
    foreach ($matches as $match) {
        // Pad to the five slots destructured below so a short match can
        // never leave $hashtag or $outer undefined.
        list($all, $before, $hash, $hashtag, $outer) = array_pad($match, 5, array('', 0));
        // preg offsets are byte offsets; re-measure the prefix with
        // StringUtils::strlen() to obtain the index used for 'indices'.
        $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
        $end_position = $start_position + StringUtils::strlen($hash[0] . $hashtag[0]);
        // Reject candidates whose following text matches 'end_hashtag_match'.
        if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
            continue;
        }
        $tags[] = array(
            'hashtag' => $hashtag[0],
            'indices' => array($start_position, $end_position)
        );
    }
    if (!$checkUrlOverlap) {
        return $tags;
    }
    # check url overlap
    $urls = $this->extractURLsWithIndices($tweet);
    $entities = $this->removeOverlappingEntities(array_merge($tags, $urls));
    $validTags = array();
    foreach ($entities as $entity) {
        // Keep only the hashtag entities that survived overlap removal.
        if (empty($entity['hashtag'])) {
            continue;
        }
        $validTags[] = $entity;
    }
    return $validTags;
}
/**
 * Extracts all the cashtags and the indices they occur at from the tweet.
 *
 * @param string $tweet The tweet to extract; defaults to the stored tweet.
 * @return array The cashtag elements in the tweet, each with 'cashtag' and
 *               'indices' (start, end).
 */
public function extractCashtagsWithIndices($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }
    // Cheap pre-check: nothing to do without a dollar sign.
    if (!preg_match('/\$/iu', $tweet)) {
        return array();
    }
    preg_match_all(self::$patterns['valid_cashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    $tags = array();
    foreach ($matches as $match) {
        // Pad to the five slots destructured below so a short match can
        // never leave $cash_text or $outer undefined.
        list($all, $before, $dollar, $cash_text, $outer) = array_pad($match, 5, array('', 0));
        // preg offsets are byte offsets; re-measure the prefix with
        // StringUtils::strlen() to obtain the index used for 'indices'.
        $start_position = $dollar[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $dollar[1])) : $dollar[1];
        $end_position = $start_position + StringUtils::strlen($dollar[0] . $cash_text[0]);
        // The hashtag end pattern is reused to reject cashtag candidates.
        if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
            continue;
        }
        $tags[] = array(
            'cashtag' => $cash_text[0],
            'indices' => array($start_position, $end_position)
        );
    }
    return $tags;
}
/**
 * Extracts all the URLs and the indices they occur at from the tweet.
 *
 * URLs with a protocol are always returned (t.co URLs are cut back to the
 * part matched by 'valid_tco_url'). URLs without a protocol are only
 * considered when the extractURLWithoutProtocol flag is on, and then only
 * their ASCII domain part as matched by 'valid_ascii_domain'.
 *
 * @param string $tweet The tweet to extract; defaults to the stored tweet.
 * @return array The URLs elements in the tweet, each with 'url' and
 *               'indices' (start, end).
 */
public function extractURLsWithIndices($tweet = null)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
// Cheap pre-check: protocol-less URLs need at least a '.', URLs with a
// protocol need at least a ':'.
$needle = $this->extractURLWithoutProtocol() ? '.' : ':';
if (strpos($tweet, $needle) === false) {
return array();
}
$urls = array();
preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
foreach ($matches as $match) {
// Each capture is a (text, byte offset) pair because of PREG_OFFSET_CAPTURE.
list($all, $before, $url, $protocol, $domain, $port, $path, $query) = array_pad($match, 8, array(''));
// Re-measure the prefix with StringUtils::strlen() to turn the byte offset
// into the index used for 'indices' (presumably a character index - confirm
// against StringUtils).
$start_position = $url[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $url[1])) : $url[1];
$end_position = $start_position + StringUtils::strlen($url[0]);
// From here on only the matched text of each capture is needed.
$all = $all[0];
$before = $before[0];
$url = $url[0];
$protocol = $protocol[0];
$domain = $domain[0];
$port = $port[0];
$path = $path[0];
$query = $query[0];
// If protocol is missing and domain contains non-ASCII characters,
// extract ASCII-only domains.
if (empty($protocol)) {
if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
continue;
}
$last_url = null;
$ascii_end_position = 0;
if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
// Substitute the ASCII domain for the full domain inside the matched URL,
// so $asciiDomain[0] becomes the URL text that is reported.
$asciiDomain[0] = preg_replace('/' . preg_quote($domain, '/') . '/u', $asciiDomain[0], $url);
$ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position);
$ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]);
$last_url = array(
'url' => $asciiDomain[0],
'indices' => array($start_position + $ascii_start_position, $start_position + $ascii_end_position),
);
// Keep the URL if it has a path, is a whitelisted special short domain,
// or is not an invalid short domain.
if (!empty($path)
|| preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0])
|| !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) {
$urls[] = $last_url;
}
}
// no ASCII-only domain found. Skip the entire URL
if (empty($last_url)) {
continue;
}
// $last_url only contains domain. Need to add path and query if they exist.
// NOTE(review): $last_url was appended to $urls by value above and is not
// read again in this iteration, so the two assignments below do not appear
// to change the returned result - confirm whether an append is missing.
if (!empty($path)) {
// last_url was not added. Add it to urls here.
$last_url['url'] = preg_replace('/' . preg_quote($domain, '/') . '/u', $last_url['url'], $url);
$last_url['indices'][1] = $end_position;
}
} else {
// In the case of t.co URLs, don't allow additional path characters
if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
$url = $tcoUrlMatches[0];
$end_position = $start_position + StringUtils::strlen($url);
}
$urls[] = array(
'url' => $url,
'indices' => array($start_position, $end_position),
);
}
}
return $urls;
}
/**
 * Extracts all the usernames and the indices they occur at from the tweet.
 *
 * Same as extractMentionsOrListsWithIndices(), with the 'list_slug' key
 * removed from every entity.
 *
 * @param string $tweet The tweet to extract; defaults to the stored tweet.
 * @return array The username elements in the tweet.
 */
public function extractMentionedScreennamesWithIndices($tweet = null)
{
    $text = is_null($tweet) ? $this->tweet : $tweet;

    return array_map(
        function ($mention) {
            unset($mention['list_slug']);
            return $mention;
        },
        $this->extractMentionsOrListsWithIndices($text)
    );
}
/**
 * Extracts all the usernames and the indices they occur at from the stored tweet.
 *
 * Takes no parameters and delegates to
 * extractMentionedScreennamesWithIndices() without arguments, so it always
 * works on the tweet stored on the instance.
 *
 * @return array The username elements in the tweet.
 * @deprecated since version 1.1.0
 */
public function extractMentionedUsernamesWithIndices()
{
return $this->extractMentionedScreennamesWithIndices();
}
/**
 * Extracts all mentions and username/list references with their indices.
 *
 * Each result has 'screen_name', 'list_slug' (empty when absent) and
 * 'indices' (start, end). The end index covers the list slug when present.
 * Candidates whose following text matches 'end_mention_match' are dropped.
 *
 * @param string $tweet The tweet to extract; defaults to the stored tweet.
 * @return array The username elements in the tweet.
 */
public function extractMentionsOrListsWithIndices($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }
    // Cheap pre-check: nothing to do without an at sign.
    if (!preg_match('/[@]/iu', $tweet)) {
        return array();
    }
    preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $found = array();
    foreach ($matches as $match) {
        list($all, $before, $at, $username, $list_slug, $outer) = array_pad($match, 6, array('', 0));
        if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
            continue;
        }

        // preg offsets are byte offsets; re-measure the prefix with
        // StringUtils::strlen() to obtain the index used for 'indices'.
        $start = $at[1];
        if ($start > 0) {
            $start = StringUtils::strlen(substr($tweet, 0, $at[1]));
        }
        $end = $start + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]);
        if (!empty($list_slug[0])) {
            $end = $end + StringUtils::strlen($list_slug[0]);
        }

        $found[] = array(
            'screen_name' => $username[0],
            'list_slug' => $list_slug[0],
            'indices' => array($start, $end),
        );
    }
    return $found;
}
/**
 * Deprecated alias of extractMentionsOrListsWithIndices().
 *
 * Takes no argument, so the callee falls back to the tweet stored on this
 * instance.
 *
 * @return array  The mention / list elements in the tweet.
 * @deprecated since version 1.1.0
 */
public function extractMentionedUsernamesOrListsWithIndices()
{
    $mentions = $this->extractMentionsOrListsWithIndices();

    return $mentions;
}
/**
 * Combined getter / setter for the extractURLWithoutProtocol flag.
 *
 * Called without an argument (or with null) it returns the current value of
 * the flag. Called with any other value it stores that value cast to bool
 * and returns this object for method chaining.
 *
 * @param  boolean|null  $flag  New value, or null to read the current one.
 * @return mixed  The current flag when reading, $this when writing.
 */
public function extractURLWithoutProtocol($flag = null)
{
    if ($flag !== null) {
        $this->extractURLWithoutProtocol = (bool) $flag;

        return $this;
    }

    return $this->extractURLWithoutProtocol;
}
/**
 * Returns a new list of entities in which no two entities overlap.
 *
 * The entities are sorted by start index first. An entity is then dropped
 * when it starts before the end index of the last entity that was kept.
 *
 * @param  array  $entities  Entities, each carrying an 'indices' => array(start, end) pair.
 * @return array  The surviving entities, ordered by start index.
 */
public function removeOverlappingEntities($entities)
{
    usort($entities, array($this, 'sortEntites'));

    $kept = array();
    $previous = null;
    foreach ($entities as $candidate) {
        $overlaps = $previous !== null && $candidate['indices'][0] < $previous['indices'][1];
        if (!$overlaps) {
            $previous = $candidate;
            $kept[] = $candidate;
        }
    }

    return $kept;
}
/**
 * usort() comparator that orders entities by their start index.
 *
 * @param  array  $a  First entity ('indices'[0] is its start index).
 * @param  array  $b  Second entity.
 * @return int  0 when both start at the same index, -1 when $a starts first, 1 otherwise.
 */
protected function sortEntites($a, $b)
{
    $left = $a['indices'][0];
    $right = $b['indices'][0];

    if ($left == $right) {
        return 0;
    }
    if ($left < $right) {
        return -1;
    }

    return 1;
}
}

202
app/Util/Lexer/HitHighlighter.php Executable file
View file

@ -0,0 +1,202 @@
<?php
/**
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
namespace App\Util\Lexer;
use App\Util\Lexer\Regex;
use App\Util\Lexer\StringUtils;
/**
* Twitter HitHighlighter Class
*
* Performs "hit highlighting" on tweets that have been auto-linked already.
* Useful with the results returned from the search API.
*
* Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
* is based on code by {@link http://github.com/mzsanford Matt Sanford} and
* heavily modified by {@link http://github.com/ngnpope Nick Pope}.
*
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
class HitHighlighter extends Regex
{
    /**
     * The tag to surround hits with.
     *
     * @var string
     */
    protected $tag = 'em';

    /**
     * Provides fluent method chaining.
     *
     * Equivalent to `new HitHighlighter($tweet, true, $full_encode)`: the
     * tweet is escaped, and $full_encode selects between htmlspecialchars()
     * and htmlentities().
     *
     * @param  string  $tweet        The tweet to be hit highlighted.
     * @param  bool    $full_encode  Whether to encode all special characters.
     *
     * @see  __construct()
     *
     * @return  HitHighlighter
     */
    public static function create($tweet = null, $full_encode = false)
    {
        // The constructor's second parameter is $escape, not $full_encode.
        // Passing $full_encode positionally as the second argument used to
        // toggle escaping instead of the encoding mode, so it is passed in
        // the third position here with escaping left at its default (true).
        return new self($tweet, true, $full_encode);
    }

    /**
     * Reads in a tweet to be parsed and hit highlighted.
     *
     * We take this opportunity to ensure that we escape user input. Existing
     * entities are not double encoded. An empty tweet, or $escape = false,
     * passes the tweet to the parent constructor untouched.
     *
     * @see  htmlspecialchars()
     *
     * @param  string  $tweet        The tweet to be hit highlighted.
     * @param  bool    $escape       Whether to escape the tweet (default: true).
     * @param  bool    $full_encode  Whether to encode all special characters
     *                               (htmlentities() instead of htmlspecialchars()).
     */
    public function __construct($tweet = null, $escape = true, $full_encode = false)
    {
        if (!empty($tweet) && $escape) {
            if ($full_encode) {
                parent::__construct(htmlentities($tweet, ENT_QUOTES, 'UTF-8', false));
            } else {
                parent::__construct(htmlspecialchars($tweet, ENT_QUOTES, 'UTF-8', false));
            }
        } else {
            parent::__construct($tweet);
        }
    }

    /**
     * Get the highlighting tag that hits are surrounded with. The default tag is 'em'.
     *
     * @return  string  The tag name.
     */
    public function getTag()
    {
        return $this->tag;
    }

    /**
     * Set the highlighting tag to surround hits with. The default tag is 'em'.
     *
     * The name is inserted into the markup as-is (as `<name>` / `</name>`).
     *
     * @param  string  $v  The tag name.
     *
     * @return  HitHighlighter  Fluid method chaining.
     */
    public function setTag($v)
    {
        $this->tag = $v;
        return $this;
    }

    /**
     * Hit highlights the tweet.
     *
     * Indices are character positions in the text of the tweet; when the
     * tweet contains markup, the characters of the `<...>` tags themselves
     * are not counted. In the markup-free case the tags are inserted hit by
     * hit with a running offset, so the hits are expected in ascending order.
     *
     * @param  string  $tweet  The tweet to be hit highlighted (defaults to
     *                         the stored tweet).
     * @param  array   $hits   An array containing the start and end index pairs
     *                         for the highlighting.
     *
     * @return  string  The hit highlighted tweet, or the tweet unchanged when
     *                  no hits are given.
     */
    public function highlight($tweet = null, array $hits = null)
    {
        if (is_null($tweet)) {
            $tweet = $this->tweet;
        }
        if (empty($hits)) {
            return $tweet;
        }
        $highlightTweet = '';
        $tags = array('<' . $this->tag . '>', '</' . $this->tag . '>');
        # Check whether we can simply replace or whether we need to chunk...
        if (strpos($tweet, '<') === false) {
            $ti = 0; // tag increment (for added tags)
            $highlightTweet = $tweet;
            foreach ($hits as $hit) {
                # Zero-length replacements, i.e. pure insertions at the hit
                # boundaries shifted by the length of the tags added so far.
                $highlightTweet = StringUtils::substrReplace($highlightTweet, $tags[0], $hit[0] + $ti, 0);
                $ti += StringUtils::strlen($tags[0]);
                $highlightTweet = StringUtils::substrReplace($highlightTweet, $tags[1], $hit[1] + $ti, 0);
                $ti += StringUtils::strlen($tags[1]);
            }
        } else {
            # Splitting on '<' and '>' yields text at the even positions and
            # the inside of each markup tag at the odd positions.
            $chunks = preg_split('/[<>]/iu', $tweet);
            $chunk = $chunks[0];
            $chunk_index = 0;      // position of the current text chunk in $chunks
            $chunk_cursor = 0;     // characters of the current chunk already emitted
            $offset = 0;           // text characters in the chunks before the current one
            $start_in_chunk = false; // an opening tag was placed inside the current chunk
            # Flatten the multidimensional hits array:
            $hits_flat = array();
            foreach ($hits as $hit) {
                $hits_flat = array_merge($hits_flat, $hit);
            }
            $hit_count = count($hits_flat);
            # Loop over the hit indices (even = opening tag, odd = closing tag):
            for ($index = 0; $index < $hit_count; $index++) {
                $hit = $hits_flat[$index];
                $tag = $tags[$index % 2];
                $placed = false;
                # Flush every text chunk that ends at or before this hit,
                # together with the markup tag that follows it.
                while ($chunk !== null && $hit >= ($i = $offset + StringUtils::strlen($chunk))) {
                    $highlightTweet .= StringUtils::substr($chunk, $chunk_cursor);
                    # A hit that ends exactly at the end of the chunk it was
                    # opened in is closed before the following markup tag.
                    if ($start_in_chunk && $hit === $i) {
                        $highlightTweet .= $tag;
                        $placed = true;
                    }
                    if (isset($chunks[$chunk_index + 1])) {
                        $highlightTweet .= '<' . $chunks[$chunk_index + 1] . '>';
                    }
                    $offset += StringUtils::strlen($chunk);
                    $chunk_cursor = 0;
                    $chunk_index += 2;
                    $chunk = (isset($chunks[$chunk_index]) ? $chunks[$chunk_index] : null);
                    $start_in_chunk = false;
                }
                # The hit lies inside the current chunk: emit the text up to
                # it, then the tag.
                if (!$placed && $chunk !== null) {
                    $hit_spot = $hit - $offset;
                    $highlightTweet .= StringUtils::substr($chunk, $chunk_cursor, $hit_spot - $chunk_cursor) . $tag;
                    $chunk_cursor = $hit_spot;
                    $start_in_chunk = ($index % 2 === 0);
                    $placed = true;
                }
                # Ultimate fallback - hits that run off the end get a closing tag:
                if (!$placed) {
                    $highlightTweet .= $tag;
                }
            }
            # Emit whatever is left after the last hit.
            if ($chunk !== null) {
                if ($chunk_cursor < StringUtils::strlen($chunk)) {
                    $highlightTweet .= StringUtils::substr($chunk, $chunk_cursor);
                }
                $chunk_count = count($chunks);
                for ($index = $chunk_index + 1; $index < $chunk_count; $index++) {
                    $highlightTweet .= ($index % 2 === 0 ? $chunks[$index] : '<' . $chunks[$index] . '>');
                }
            }
        }
        return $highlightTweet;
    }

    /**
     * Hit highlights the tweet stored on this instance.
     *
     * @param  array  $hits  An array containing the start and end index pairs
     *                       for the highlighting.
     *
     * @return  string  The hit highlighted tweet.
     * @deprecated since version 1.1.0
     */
    public function addHitHighlighting(array $hits)
    {
        return $this->highlight($this->tweet, $hits);
    }
}

348
app/Util/Lexer/LooseAutolink.php Executable file
View file

@ -0,0 +1,348 @@
<?php
/**
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @author Takashi Nojima
* @copyright Copyright 2014 Mike Cochrane, Nick Pope, Takashi Nojima
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
namespace App\Util\Lexer;
use App\Util\Lexer\Autolink;
/**
* Twitter LooseAutolink Class
*
* Parses tweets and generates HTML anchor tags around URLs, usernames,
* username/list pairs and hashtags.
*
* Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
* is based on code by {@link http://github.com/mzsanford Matt Sanford} and
* heavily modified by {@link http://github.com/ngnpope Nick Pope}.
*
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @author Takashi Nojima
* @copyright Copyright 2014 Mike Cochrane, Nick Pope, Takashi Nojima
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
* @since 1.8.0
* @deprecated since version 1.9.0
*/
class LooseAutolink extends Autolink
{
    /**
     * Auto-link hashtags, URLs, usernames and lists.
     *
     * When a tweet is passed it replaces the tweet stored on this instance.
     *
     * @param  string  $tweet  The tweet to be converted (defaults to the stored tweet).
     * @return string  The tweet with auto-link HTML added.
     * @deprecated since version 1.9.0
     */
    public function autoLink($tweet = null)
    {
        if (!is_null($tweet)) {
            $this->tweet = $tweet;
        }
        return $this->addLinks();
    }

    /**
     * Auto-link the @username and @username/list references in the provided text. Links to @username references will
     * have the usernameClass CSS classes added. Links to @username/list references will have the listClass CSS class
     * added.
     *
     * When a tweet is passed it replaces the tweet stored on this instance.
     *
     * @param  string  $tweet  The tweet to be converted (defaults to the stored tweet).
     * @return string  The tweet with auto-link HTML added.
     */
    public function autoLinkUsernamesAndLists($tweet = null)
    {
        if (!is_null($tweet)) {
            $this->tweet = $tweet;
        }
        return $this->addLinksToUsernamesAndLists();
    }

    /**
     * Auto-link #hashtag references in the provided Tweet text. The #hashtag links will have the hashtagClass CSS class
     * added.
     *
     * When a tweet is passed it replaces the tweet stored on this instance.
     *
     * @param  string  $tweet  The tweet to be converted (defaults to the stored tweet).
     * @return string  The tweet with auto-link HTML added.
     */
    public function autoLinkHashtags($tweet = null)
    {
        if (!is_null($tweet)) {
            $this->tweet = $tweet;
        }
        return $this->addLinksToHashtags();
    }

    /**
     * Auto-link URLs in the Tweet text provided.
     *
     * This only auto-links URLs with protocol.
     *
     * When a tweet is passed it replaces the tweet stored on this instance.
     *
     * @param  string  $tweet  The tweet to be converted (defaults to the stored tweet).
     * @return string  The tweet with auto-link HTML added.
     */
    public function autoLinkURLs($tweet = null)
    {
        if (!is_null($tweet)) {
            $this->tweet = $tweet;
        }
        return $this->addLinksToURLs();
    }

    /**
     * Auto-link $cashtag references in the provided Tweet text. The $cashtag links will have the cashtagClass CSS class
     * added.
     *
     * When a tweet is passed it replaces the tweet stored on this instance.
     *
     * @param  string  $tweet  The tweet to be converted (defaults to the stored tweet).
     * @return string  The tweet with auto-link HTML added.
     */
    public function autoLinkCashtags($tweet = null)
    {
        if (!is_null($tweet)) {
            $this->tweet = $tweet;
        }
        return $this->addLinksToCashtags();
    }

    /**
     * Adds links to all elements in the tweet.
     *
     * The individual passes read $this->tweet, so the intermediate result is
     * stored there between passes (URLs, hashtags, cashtags, then usernames
     * and lists). The original tweet is put back before returning.
     *
     * @return string  The modified tweet.
     * @deprecated since version 1.9.0
     */
    public function addLinks()
    {
        $original = $this->tweet;
        $this->tweet = $this->addLinksToURLs();
        $this->tweet = $this->addLinksToHashtags();
        $this->tweet = $this->addLinksToCashtags();
        $this->tweet = $this->addLinksToUsernamesAndLists();
        $modified = $this->tweet;
        $this->tweet = $original;
        return $modified;
    }

    /**
     * Adds links to hashtag elements in the tweet.
     *
     * @return string  The modified tweet.
     */
    public function addLinksToHashtags()
    {
        return preg_replace_callback(
            self::$patterns['valid_hashtag'],
            array($this, '_addLinksToHashtags'),
            $this->tweet
        );
    }

    /**
     * Adds links to cashtag elements in the tweet.
     *
     * @return string  The modified tweet.
     */
    public function addLinksToCashtags()
    {
        return preg_replace_callback(
            self::$patterns['valid_cashtag'],
            array($this, '_addLinksToCashtags'),
            $this->tweet
        );
    }

    /**
     * Adds links to URL elements in the tweet.
     *
     * @return string  The modified tweet.
     */
    public function addLinksToURLs()
    {
        return preg_replace_callback(self::$patterns['valid_url'], array($this, '_addLinksToURLs'), $this->tweet);
    }

    /**
     * Adds links to username/list elements in the tweet.
     *
     * @return string  The modified tweet.
     */
    public function addLinksToUsernamesAndLists()
    {
        return preg_replace_callback(
            self::$patterns['valid_mentions_or_lists'],
            array($this, '_addLinksToUsernamesAndLists'),
            $this->tweet
        );
    }

    /**
     * Wraps a tweet element in an HTML anchor tag using the provided URL.
     *
     * This is a helper function to perform the generation of the link.
     * Attributes are emitted in the order class, href, rel, target; rel and
     * target depend on the external / nofollow / target settings of this
     * instance. The arguments are inserted as given, without escaping.
     *
     * @param  string  $url      The URL to use as the href.
     * @param  string  $class    The CSS class(es) to apply (space separated).
     * @param  string  $element  The tweet element to wrap.
     *
     * @return string  The tweet element with a link applied.
     * @deprecated since version 1.1.0
     */
    protected function wrap($url, $class, $element)
    {
        $link = '<a';
        if ($class) {
            $link .= ' class="' . $class . '"';
        }
        $link .= ' href="' . $url . '"';
        $rel = array();
        if ($this->external) {
            $rel[] = 'external';
        }
        if ($this->nofollow) {
            $rel[] = 'nofollow';
        }
        if (!empty($rel)) {
            $link .= ' rel="' . implode(' ', $rel) . '"';
        }
        if ($this->target) {
            $link .= ' target="' . $this->target . '"';
        }
        $link .= '>' . $element . '</a>';
        return $link;
    }

    /**
     * Wraps a tweet element in an HTML anchor tag using the provided URL.
     *
     * This is a helper function to perform the generation of the hashtag link.
     * In addition to what wrap() emits, the anchor gets a title attribute
     * holding the element with any full-width number sign replaced by "#".
     *
     * @param  string  $url      The URL to use as the href.
     * @param  string  $class    The CSS class(es) to apply (space separated).
     * @param  string  $element  The tweet element to wrap.
     *
     * @return string  The tweet element with a link applied.
     */
    protected function wrapHash($url, $class, $element)
    {
        // Normalise the full-width number sign (U+FF03) to an ASCII "#".
        // The pattern must not be empty: an empty pattern matches at every
        // position and would insert "#" between all characters of the title.
        $title = preg_replace('/\x{FF03}/u', '#', $element);
        $link = '<a';
        $link .= ' href="' . $url . '"';
        $link .= ' title="' . $title . '"';
        if ($class) {
            $link .= ' class="' . $class . '"';
        }
        $rel = array();
        if ($this->external) {
            $rel[] = 'external';
        }
        if ($this->nofollow) {
            $rel[] = 'nofollow';
        }
        if (!empty($rel)) {
            $link .= ' rel="' . implode(' ', $rel) . '"';
        }
        if ($this->target) {
            $link .= ' target="' . $this->target . '"';
        }
        $link .= '>' . $element . '</a>';
        return $link;
    }

    /**
     * Callback used by the method that adds links to hashtags.
     *
     * The match is returned unchanged when the text after it matches
     * 'end_hashtag_match', starts with a closing tag ("</"), or starts with a
     * quote that is not paired with a quote in front of the hashtag.
     * Hashtags containing right-to-left characters get an extra 'rtl' class.
     *
     * @see  addLinksToHashtags()
     * @param  array  $matches  The regular expression matches.
     * @return string  The link-wrapped hashtag.
     */
    protected function _addLinksToHashtags($matches)
    {
        list($all, $before, $hash, $tag, $after) = array_pad($matches, 5, '');
        if (preg_match(self::$patterns['end_hashtag_match'], $after)
            || (!preg_match('!\A["\']!', $before) && preg_match('!\A["\']!', $after))
            || preg_match('!\A</!', $after)) {
            return $all;
        }
        $replacement = $before;
        $element = $hash . $tag;
        $url = $this->url_base_hash . $tag;
        $class_hash = $this->class_hash;
        if (preg_match(self::$patterns['rtl_chars'], $element)) {
            $class_hash .= ' rtl';
        }
        $replacement .= $this->wrapHash($url, $class_hash, $element);
        return $replacement;
    }

    /**
     * Callback used by the method that adds links to cashtags.
     *
     * The match is returned unchanged under the same conditions as for
     * hashtags, using 'end_cashtag_match' for the trailing text.
     *
     * @see  addLinksToCashtags()
     * @param  array  $matches  The regular expression matches.
     * @return string  The link-wrapped cashtag.
     */
    protected function _addLinksToCashtags($matches)
    {
        list($all, $before, $cash, $tag, $after) = array_pad($matches, 5, '');
        if (preg_match(self::$patterns['end_cashtag_match'], $after)
            || (!preg_match('!\A["\']!', $before) && preg_match('!\A["\']!', $after))
            || preg_match('!\A</!', $after)) {
            return $all;
        }
        $replacement = $before;
        $element = $cash . $tag;
        $url = $this->url_base_cash . $tag;
        $replacement .= $this->wrapHash($url, $this->class_cash, $element);
        return $replacement;
    }

    /**
     * Callback used by the method that adds links to URLs.
     *
     * URLs without a protocol are left untouched. Linked URLs are HTML
     * escaped (without double encoding) and used both as href and link text.
     *
     * @see  addLinksToURLs()
     * @param  array  $matches  The regular expression matches.
     * @return string  The link-wrapped URL.
     */
    protected function _addLinksToURLs($matches)
    {
        list($all, $before, $url, $protocol, $domain, $path, $query) = array_pad($matches, 7, '');
        if (!$protocol) {
            return $all;
        }
        $url = htmlspecialchars($url, ENT_QUOTES, 'UTF-8', false);
        return $before . $this->wrap($url, $this->class_url, $url);
    }

    /**
     * Callback used by the method that adds links to username/list pairs.
     *
     * A username followed by a list slug is linked as a list. A plain
     * username is linked as a user unless the text after it matches
     * 'end_mention_match', in which case the match is returned unchanged.
     * The at-sign stays outside of the anchor.
     *
     * @see  addLinksToUsernamesAndLists()
     * @param  array  $matches  The regular expression matches.
     * @return string  The link-wrapped username/list pair.
     */
    protected function _addLinksToUsernamesAndLists($matches)
    {
        list($all, $before, $at, $username, $slash_listname, $after) = array_pad($matches, 6, '');
        # If $after is not empty, there is an invalid character.
        if (!empty($slash_listname)) {
            # Replace the list and username
            $element = $username . $slash_listname;
            $class = $this->class_list;
            $url = $this->url_base_list . $element;
        } else {
            if (preg_match(self::$patterns['end_mention_match'], $after)) {
                return $all;
            }
            # Replace the username
            $element = $username;
            $class = $this->class_user;
            $url = $this->url_base_user . $element;
        }
        # XXX: Due to use of preg_replace_callback() for multiple replacements in a
        #      single tweet and also as only the match is replaced and we have to
        #      use a look-ahead for $after because there is no equivalent for the
        #      $' (dollar apostrophe) global from Ruby, we MUST NOT append $after.
        return $before . $at . $this->wrap($url, $class, $element);
    }
}

337
app/Util/Lexer/Regex.php Executable file

File diff suppressed because one or more lines are too long

104
app/Util/Lexer/StringUtils.php Executable file
View file

@ -0,0 +1,104 @@
<?php
/**
* @author Takashi Nojima
* @copyright Copyright 2014, Takashi Nojima
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
namespace App\Util\Lexer;
/**
* String utility
*
* @author Takashi Nojima
* @copyright Copyright 2014, Takashi Nojima
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter
*/
class StringUtils
{
    /**
     * Multibyte-aware substring (wrapper around mb_substr).
     *
     * @param  string   $str       The string to cut from.
     * @param  integer  $start     Character position to start at.
     * @param  integer  $length    Number of characters; null means "up to the
     *                             length of the string".
     * @param  string   $encoding  Character encoding of $str.
     * @return string
     */
    public static function substr($str, $start, $length = null, $encoding = 'UTF-8')
    {
        // for PHP <= 5.4.7: pass an explicit length instead of null
        $count = is_null($length) ? mb_strlen($str, $encoding) : $length;

        return mb_substr($str, $start, $count, $encoding);
    }

    /**
     * Multibyte-aware string length (wrapper around mb_strlen).
     *
     * @param  string  $str       The string to measure.
     * @param  string  $encoding  Character encoding of $str.
     * @return integer  Number of characters.
     */
    public static function strlen($str, $encoding = 'UTF-8')
    {
        $characters = mb_strlen($str, $encoding);

        return $characters;
    }

    /**
     * Multibyte-aware search (wrapper around mb_strpos).
     *
     * @param  string   $haystack  The string to search in.
     * @param  string   $needle    The string to look for.
     * @param  integer  $offset    Character position to start searching from.
     * @param  string   $encoding  Character encoding of the strings.
     * @return integer  Whatever mb_strpos() returns for these arguments.
     */
    public static function strpos($haystack, $needle, $offset = 0, $encoding = 'UTF-8')
    {
        $position = mb_strpos($haystack, $needle, $offset, $encoding);

        return $position;
    }

    /**
     * A multibyte-aware substring replacement function.
     *
     * Start and length are clamped to the string: a negative start counts
     * from the end, a negative length leaves that many characters at the end
     * untouched, and a null length replaces everything from $start onwards.
     * Without the mbstring extension the call is handed to substr_replace().
     *
     * @param  string  $string       The string to modify.
     * @param  string  $replacement  The replacement string.
     * @param  int     $start        The start of the replacement.
     * @param  int     $length       The number of characters to replace.
     * @param  string  $encoding     The encoding of the string.
     *
     * @return string  The modified string.
     *
     * @see http://www.php.net/manual/en/function.substr-replace.php#90146
     */
    public static function substrReplace($string, $replacement, $start, $length = null, $encoding = 'UTF-8')
    {
        if (extension_loaded('mbstring') !== true) {
            if (is_null($length) === true) {
                return substr_replace($string, $replacement, $start);
            }

            return substr_replace($string, $replacement, $start, $length);
        }

        $total = static::strlen($string, $encoding);

        // Bring the start position into the range 0..$total.
        if ($start < 0) {
            $start = max(0, $total + $start);
        } elseif ($start > $total) {
            $start = $total;
        }

        // Turn the length into a non-negative count that fits the string.
        if ($length < 0) {
            $length = max(0, $total - $start + $length);
        } elseif ((is_null($length) === true) || ($length > $total)) {
            $length = $total;
        }
        if (($start + $length) > $total) {
            $length = $total - $start;
        }

        $head = static::substr($string, 0, $start, $encoding);
        $tail = static::substr($string, $start + $length, $total - $start - $length, $encoding);

        return $head . $replacement . $tail;
    }
}

388
app/Util/Lexer/Validator.php Executable file
View file

@ -0,0 +1,388 @@
<?php
/**
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
namespace App\Util\Lexer;
use App\Util\Lexer\Regex;
use App\Util\Lexer\Extractor;
use App\Util\Lexer\StringUtils;
/**
* Twitter Validator Class
*
* Performs "validation" on tweets.
*
* Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
* is based on code by {@link http://github.com/mzsanford Matt Sanford} and
* heavily modified by {@link http://github.com/ngnpope Nick Pope}.
*
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter.Text
*/
class Validator extends Regex
{
    /**
     * The maximum length of a tweet.
     *
     * @var int
     */
    const MAX_LENGTH = 140;

    /**
     * The length of a short URL beginning with http:
     *
     * @var int
     */
    protected $short_url_length = 23;

    /**
     * The length of a short URL beginning with https:
     *
     * @var int
     */
    protected $short_url_length_https = 23;

    /**
     * Extractor used to find URLs, mentions and hashtags during validation.
     *
     * @var Extractor
     */
    protected $extractor = null;

    /**
     * Provides fluent method chaining.
     *
     * @param  string  $tweet   The tweet to be validated.
     * @param  mixed   $config  Setup short URL length from Twitter API /help/configuration response.
     *
     * @see  __construct()
     *
     * @return  Validator
     */
    public static function create($tweet = null, $config = null)
    {
        return new self($tweet, $config);
    }

    /**
     * Reads in a tweet to be parsed and validates it.
     *
     * @param  string  $tweet   The tweet to validate.
     * @param  mixed   $config  Optional array or object with short URL lengths,
     *                          see setConfiguration().
     */
    public function __construct($tweet = null, $config = null)
    {
        parent::__construct($tweet);
        if (!empty($config)) {
            $this->setConfiguration($config);
        }
        $this->extractor = Extractor::create();
    }

    /**
     * Setup short URL length from Twitter API /help/configuration response
     *
     * Reads 'short_url_length' and 'short_url_length_https' from an array
     * (as keys) or an object (as properties). Missing entries and any other
     * kind of value leave the current settings untouched.
     *
     * @param  mixed  $config
     * @return  Validator
     * @link  https://dev.twitter.com/docs/api/1/get/help/configuration
     */
    public function setConfiguration($config)
    {
        if (is_array($config)) {
            // setup from array
            if (isset($config['short_url_length'])) {
                $this->setShortUrlLength($config['short_url_length']);
            }
            if (isset($config['short_url_length_https'])) {
                $this->setShortUrlLengthHttps($config['short_url_length_https']);
            }
        } elseif (is_object($config)) {
            // setup from object
            if (isset($config->short_url_length)) {
                $this->setShortUrlLength($config->short_url_length);
            }
            if (isset($config->short_url_length_https)) {
                $this->setShortUrlLengthHttps($config->short_url_length_https);
            }
        }
        return $this;
    }

    /**
     * Set the length of a short URL beginning with http:
     *
     * @param  mixed  $length  Converted with intval().
     * @return  Validator
     */
    public function setShortUrlLength($length)
    {
        $this->short_url_length = intval($length);
        return $this;
    }

    /**
     * Get the length of a short URL beginning with http:
     *
     * @return  int
     */
    public function getShortUrlLength()
    {
        return $this->short_url_length;
    }

    /**
     * Set the length of a short URL beginning with https:
     *
     * @param  mixed  $length  Converted with intval().
     * @return  Validator
     */
    public function setShortUrlLengthHttps($length)
    {
        $this->short_url_length_https = intval($length);
        return $this;
    }

    /**
     * Get the length of a short URL beginning with https:
     *
     * @return  int
     */
    public function getShortUrlLengthHttps()
    {
        return $this->short_url_length_https;
    }

    /**
     * Check whether a tweet is valid.
     *
     * A tweet is valid when it is not empty, its length as computed by
     * getTweetLength() does not exceed MAX_LENGTH, and it does not match the
     * 'invalid_characters' pattern.
     *
     * @param  string  $tweet  The tweet to validate (defaults to the stored tweet).
     * @return  boolean  Whether the tweet is valid.
     */
    public function isValidTweetText($tweet = null)
    {
        if (is_null($tweet)) {
            $tweet = $this->tweet;
        }
        // Compare against null / '' explicitly: a plain truthiness test would
        // also reject the one-character tweet "0".
        if ($tweet === null || $tweet === '') {
            return false;
        }
        $length = $this->getTweetLength($tweet);
        if (!$length || $length > self::MAX_LENGTH) {
            return false;
        }
        if (preg_match(self::$patterns['invalid_characters'], $tweet)) {
            return false;
        }
        return true;
    }

    /**
     * Check whether the stored tweet is valid.
     *
     * @return  boolean  Whether the tweet is valid.
     * @deprecated since version 1.1.0
     */
    public function validateTweet()
    {
        return $this->isValidTweetText();
    }

    /**
     * Check whether a username is valid.
     *
     * The username (including its leading at-sign) is valid when the
     * extractor finds exactly one screen name in it and that screen name
     * equals the input with its first character removed.
     *
     * @param  string  $username  The username to validate (defaults to the stored tweet).
     * @return  boolean  Whether the username is valid.
     */
    public function isValidUsername($username = null)
    {
        if (is_null($username)) {
            $username = $this->tweet;
        }
        $length = StringUtils::strlen($username);
        if (empty($username) || !$length) {
            return false;
        }
        $extracted = $this->extractor->extractMentionedScreennames($username);
        // Strip the sigil as one character, not one byte, so that a
        // multibyte at-sign does not leave a broken string behind.
        return count($extracted) === 1 && $extracted[0] === StringUtils::substr($username, 1);
    }

    /**
     * Check whether the stored text is a valid username.
     *
     * @return  boolean  Whether the username is valid.
     * @deprecated since version 1.1.0
     */
    public function validateUsername()
    {
        return $this->isValidUsername();
    }

    /**
     * Check whether a list is valid.
     *
     * The text is matched against 'valid_mentions_or_lists'. It is valid when
     * nothing precedes the at-sign (group 1), a list slug is present
     * (group 4) and nothing follows it (group 5).
     *
     * @param  string  $list  The list name to validate (defaults to the stored tweet).
     * @return  boolean  Whether the list is valid.
     */
    public function isValidList($list = null)
    {
        if (is_null($list)) {
            $list = $this->tweet;
        }
        $length = StringUtils::strlen($list);
        if (empty($list) || !$length) {
            return false;
        }
        preg_match(self::$patterns['valid_mentions_or_lists'], $list, $matches);
        // Index 5 is read below, so six slots (0..5) must exist.
        $matches = array_pad($matches, 6, '');
        return $matches[1] === '' && !empty($matches[4]) && $matches[5] === '';
    }

    /**
     * Check whether the stored text is a valid list.
     *
     * @return  boolean  Whether the list is valid.
     * @deprecated since version 1.1.0
     */
    public function validateList()
    {
        return $this->isValidList();
    }

    /**
     * Check whether a hashtag is valid.
     *
     * The hashtag (including its leading number sign) is valid when the
     * extractor finds exactly one hashtag in it and that hashtag equals the
     * input with its first character removed.
     *
     * @param  string  $hashtag  The hashtag to validate (defaults to the stored tweet).
     * @return  boolean  Whether the hashtag is valid.
     */
    public function isValidHashtag($hashtag = null)
    {
        if (is_null($hashtag)) {
            $hashtag = $this->tweet;
        }
        $length = StringUtils::strlen($hashtag);
        if (empty($hashtag) || !$length) {
            return false;
        }
        $extracted = $this->extractor->extractHashtags($hashtag);
        // Strip the sigil as one character, not one byte, so that a
        // multibyte number sign does not leave a broken string behind.
        return count($extracted) === 1 && $extracted[0] === StringUtils::substr($hashtag, 1);
    }

    /**
     * Check whether the stored text is a valid hashtag.
     *
     * @return  boolean  Whether the hashtag is valid.
     * @deprecated since version 1.1.0
     */
    public function validateHashtag()
    {
        return $this->isValidHashtag();
    }

    /**
     * Check whether a URL is valid.
     *
     * The URL is split with 'validate_url_unencoded' (which has to match the
     * whole input) and each part is then checked against its own pattern.
     * Query and fragment are optional. When a protocol is required, only
     * http and https are accepted.
     *
     * @param  string   $url               The url to validate (defaults to the stored tweet).
     * @param  boolean  $unicode_domains   Consider the domain to be unicode.
     * @param  boolean  $require_protocol  Require a protocol for valid domain?
     *
     * @return  boolean  Whether the URL is valid.
     */
    public function isValidURL($url = null, $unicode_domains = true, $require_protocol = true)
    {
        if (is_null($url)) {
            $url = $this->tweet;
        }
        $length = StringUtils::strlen($url);
        if (empty($url) || !$length) {
            return false;
        }
        preg_match(self::$patterns['validate_url_unencoded'], $url, $matches);
        # Take the full match off the front; the capture groups remain.
        $match = array_shift($matches);
        if (!$matches || $match !== $url) {
            return false;
        }
        list($scheme, $authority, $path, $query, $fragment) = array_pad($matches, 5, '');
        # Check scheme, path, query, fragment:
        if (($require_protocol && !(
                self::isValidMatch($scheme, self::$patterns['validate_url_scheme'])
                && preg_match('/^https?$/i', $scheme)
            ))
            || !self::isValidMatch($path, self::$patterns['validate_url_path'])
            || !self::isValidMatch($query, self::$patterns['validate_url_query'], true)
            || !self::isValidMatch($fragment, self::$patterns['validate_url_fragment'], true)) {
            return false;
        }
        # Check authority:
        $authority_pattern = $unicode_domains ? 'validate_url_unicode_authority' : 'validate_url_authority';
        return self::isValidMatch($authority, self::$patterns[$authority_pattern]);
    }

    /**
     * Check whether the stored text is a valid URL.
     *
     * @param  boolean  $unicode_domains   Consider the domain to be unicode.
     * @param  boolean  $require_protocol  Require a protocol for valid domain?
     *
     * @return  boolean  Whether the URL is valid.
     * @deprecated since version 1.1.0
     */
    public function validateURL($unicode_domains = true, $require_protocol = true)
    {
        return $this->isValidURL(null, $unicode_domains, $require_protocol);
    }

    /**
     * Determines the length of a tweet. Takes shortening of URLs into account.
     *
     * Starts from the character count of the tweet. Every extracted URL is
     * then counted with the configured short URL length (the https variant
     * for URLs starting with "https://") instead of its own length.
     *
     * @param  string  $tweet  The tweet to validate (defaults to the stored tweet).
     * @return  int  the length of a tweet.
     */
    public function getTweetLength($tweet = null)
    {
        if (is_null($tweet)) {
            $tweet = $this->tweet;
        }
        $length = StringUtils::strlen($tweet);
        $urls_with_indices = $this->extractor->extractURLsWithIndices($tweet);
        foreach ($urls_with_indices as $x) {
            # start - end is negative: this removes the URL's own length.
            $length += $x['indices'][0] - $x['indices'][1];
            $length += stripos($x['url'], 'https://') === 0 ? $this->short_url_length_https : $this->short_url_length;
        }
        return $length;
    }

    /**
     * Determines the length of the stored tweet. Takes shortening of URLs into account.
     *
     * @return  int  the length of a tweet.
     * @deprecated since version 1.1.0
     */
    public function getLength()
    {
        return $this->getTweetLength();
    }

    /**
     * A helper function to check for a valid match. Used in URL validation.
     *
     * For a compulsory part, the pattern has to match the whole string. For
     * an optional part, the check only fails when the string is present and
     * the pattern does not match all of it.
     *
     * @param  string   $string    The subject string to test.
     * @param  string   $pattern   The pattern to match against.
     * @param  boolean  $optional  Whether a match is compulsory or not.
     *
     * @return  boolean  Whether an exact match was found.
     */
    protected static function isValidMatch($string, $pattern, $optional = false)
    {
        $found = preg_match($pattern, $string, $matches);
        if (!$optional) {
            return (($string || $string === '') && $found && $matches[0] === $string);
        } else {
            return !(($string || $string === '') && (!$found || $matches[0] !== $string));
        }
    }
}