/**
 * Scans post content and collects mentions, hashtags, shortcodes, links, embeds and images.
 *
 * Results are added on top of $already_extracted. For every kind of item that is found,
 * a counter is stored under $extracted['has'][<kind>] and the details under their own key:
 *  - 'mention'         => array( 'name' => lowercased, de-duplicated @names )
 *  - 'hashtag'         => array( 'name' => lowercased, de-duplicated #tags ) (only when IS_WPCOM is defined and truthy)
 *  - 'shortcode'       => array( <name> => array( 'count' => int, 'id' => ids found for keeper shortcodes ) )
 *  - 'shortcode_types' => list of distinct (sanitized) shortcode names
 *  - 'link'            => list of array( 'url', 'host_reversed', 'host' )
 *  - 'embed'           => array( 'url' => list of URLs, without protocol, that match an oEmbed provider )
 * Whatever Jetpack_Media_Meta_Extractor::extract_images_from_content() returns is merged in as well.
 *
 * @param string $content           Content to scan.
 * @param int    $what_to_extract   Bitmask built from self::IMAGES, self::MENTIONS, self::HASHTAGS,
 *                                  self::SHORTCODES, self::LINKS and self::EMBEDS. Defaults to self::ALL.
 * @param array  $already_extracted Previously extracted data to start from.
 *
 * @return array The extracted data.
 */
public static function extract_from_content($content, $what_to_extract = self::ALL, $already_extracted = array())
{
    $stripped_content = self::get_stripped_content($content);

    // Maybe start with some previously extracted things (e.g. images from extract()).
    $extracted = $already_extracted;

    // ----------------------------------- IMAGES ------------------------------
    // Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.
    if (self::IMAGES & $what_to_extract) {
        $images = Jetpack_Media_Meta_Extractor::extract_images_from_content($stripped_content, array());
        $extracted = array_merge($extracted, $images);
    }

    // ----------------------------------- MENTIONS ------------------------------
    if (self::MENTIONS & $what_to_extract) {
        if (preg_match_all('/(^|\\s)@(\\w+)/u', $stripped_content, $matches)) {
            // Lowercase BEFORE de-duplicating, otherwise "@Bob" and "@bob" are stored (and counted) twice.
            // array_unique() retains the keys, hence the array_values().
            $mentions = array_values(array_unique(array_map('strtolower', $matches[2])));
            $extracted['mention'] = array('name' => $mentions);
            if (!isset($extracted['has'])) {
                $extracted['has'] = array();
            }
            $extracted['has']['mention'] = count($mentions);
        }
    }

    // ----------------------------------- HASHTAGS ------------------------------
    /** Some hosts may not compile with --enable-unicode-properties and kick a warning:
     * Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
     * Therefore, we only run this code block on wpcom, not in Jetpack.
     */
    if (defined('IS_WPCOM') && IS_WPCOM && (self::HASHTAGS & $what_to_extract)) {
        // This regex does not exactly match Twitter's.
        // If there are problems/complaints we should implement this:
        // https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
        if (preg_match_all('/(?:^|\\s)#(\\w*\\p{L}+\\w*)/u', $stripped_content, $matches)) {
            // Same ordering as for mentions: lowercase first, then de-duplicate and re-index.
            $hashtags = array_values(array_unique(array_map('strtolower', $matches[1])));
            $extracted['hashtag'] = array('name' => $hashtags);
            if (!isset($extracted['has'])) {
                $extracted['has'] = array();
            }
            $extracted['has']['hashtag'] = count($hashtags);
        }
    }

    // ----------------------------------- SHORTCODES ------------------------------
    // Always look for shortcodes.
    // If we don't want them, we'll just remove them, so we don't grab them as links below.
    $shortcode_pattern = '/' . get_shortcode_regex() . '/s';
    if (preg_match_all($shortcode_pattern, $content, $matches)) {
        $shortcode_total_count = 0;
        $shortcode_type_counts = array();
        $shortcode_types = array();
        $shortcode_details = array();

        if (self::SHORTCODES & $what_to_extract) {
            foreach ($matches[2] as $key => $shortcode) {
                // Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
                $shortcode_name = preg_replace('/[.,*"\'\\/\\\\#+ ]/', '_', $shortcode);
                $attr = shortcode_parse_atts($matches[3][$key]);

                $shortcode_total_count++;
                if (!isset($shortcode_type_counts[$shortcode_name])) {
                    $shortcode_type_counts[$shortcode_name] = 0;
                }
                $shortcode_type_counts[$shortcode_name]++;

                // Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below).
                // @todo Store number of occurrences?
                if (!in_array($shortcode_name, $shortcode_types)) {
                    $shortcode_types[] = $shortcode_name;
                }

                // For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
                if (in_array($shortcode, self::$KEEPER_SHORTCODES)) {
                    // Clear shortcode ID data left from the last shortcode.
                    $id = null;

                    // We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id().
                    // If the shortcode is a class, we'll call XyzShortcode::get_xyz_id().
                    $shortcode_get_id_func = "jetpack_shortcode_get_{$shortcode}_id";
                    $shortcode_class_name = ucfirst($shortcode) . 'Shortcode';
                    $shortcode_get_id_method = "get_{$shortcode}_id";
                    if (function_exists($shortcode_get_id_func)) {
                        $id = call_user_func($shortcode_get_id_func, $attr);
                    } elseif (method_exists($shortcode_class_name, $shortcode_get_id_method)) {
                        $id = call_user_func(array($shortcode_class_name, $shortcode_get_id_method), $attr);
                    }

                    // Record each non-empty ID only once per shortcode type.
                    if (!empty($id) && (!isset($shortcode_details[$shortcode_name]) || !in_array($id, $shortcode_details[$shortcode_name]))) {
                        $shortcode_details[$shortcode_name][] = $id;
                    }
                }
            }

            if ($shortcode_total_count > 0) {
                // Add the shortcode info to the $extracted array.
                if (!isset($extracted['has'])) {
                    $extracted['has'] = array();
                }
                $extracted['has']['shortcode'] = $shortcode_total_count;
                $extracted['shortcode'] = array();
                foreach ($shortcode_type_counts as $type => $count) {
                    $extracted['shortcode'][$type] = array('count' => $count);
                }
                if (!empty($shortcode_types)) {
                    $extracted['shortcode_types'] = $shortcode_types;
                }
                foreach ($shortcode_details as $type => $id) {
                    $extracted['shortcode'][$type]['id'] = $id;
                }
            }
        }

        // Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
        $content = preg_replace($shortcode_pattern, ' ', $content);
    }

    // ----------------------------------- LINKS ------------------------------
    if (self::LINKS & $what_to_extract) {
        // To hold the extracted stuff we find.
        $links = array();

        // @todo Get the text inside the links?
        // Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
        // (we treat embed links as just another link).
        if (preg_match_all('#(?:^|\\s|"|\')(https?://([^\\s()<>]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/))))#', $content, $matches)) {
            foreach ($matches[1] as $link_raw) {
                // Remove large (and likely invalid) links.
                if (4096 < strlen($link_raw)) {
                    continue;
                }

                $url = parse_url($link_raw);

                // parse_url() returns false on seriously malformed URLs (e.g. "http:///x") and may
                // omit the host; such links cannot be indexed, so reject them instead of reading
                // missing array offsets below.
                if (!is_array($url) || !isset($url['scheme'], $url['host'])) {
                    continue;
                }

                // Data URI links.
                if ('data' === $url['scheme']) {
                    continue;
                }

                // Build a simple form of the URL so we can compare it to ones we found in IMAGES or SHORTCODES and exclude those.
                $simple_url = $url['scheme'] . '://' . $url['host'] . (!empty($url['path']) ? $url['path'] : '');
                if (isset($extracted['image']['url'])) {
                    if (in_array($simple_url, (array) $extracted['image']['url'])) {
                        continue;
                    }
                }

                // Split on the FIRST "://" only, so a URL carrying another URL in its path or
                // query string (e.g. ?u=http://...) is not truncated.
                $link_parts = explode('://', $link_raw, 2);
                $link_all_but_proto = $link_parts[1];

                // Build a reversed hostname. implode() keeps the dot after a "0" label, which a
                // manual !empty() check would drop (e.g. for the host 10.0.0.0).
                $host_reversed = implode('.', array_reverse(explode('.', $url['host'])));

                // @todo Check unique before adding
                $links[] = array('url' => $link_all_but_proto, 'host_reversed' => $host_reversed, 'host' => $url['host']);
            }
        }

        $link_count = count($links);
        if ($link_count) {
            $extracted['link'] = $links;
            if (!isset($extracted['has'])) {
                $extracted['has'] = array();
            }
            $extracted['has']['link'] = $link_count;
        }
    }

    // ----------------------------------- EMBEDS ------------------------------
    // Embeds are just individual links on their own line.
    if (self::EMBEDS & $what_to_extract) {
        if (!function_exists('_wp_oembed_get_object')) {
            include ABSPATH . WPINC . '/class-oembed.php';
        }

        // Get an oembed object.
        $oembed = _wp_oembed_get_object();

        // Grab any links on their own lines that may be embeds.
        if (preg_match_all('|^\\s*(https?://[^\\s"]+)\\s*$|im', $content, $matches)) {
            // To hold the extracted stuff we find.
            $embeds = array();

            foreach ($matches[1] as $link_raw) {
                // Split on the first "://" only (see the LINKS section).
                $link_parts = explode('://', $link_raw, 2);
                $link_all_but_proto = $link_parts[1];

                // Check whether this "link" is really an embed.
                foreach ($oembed->providers as $matchmask => $data) {
                    // $data[1] flags whether $matchmask already is a regex.
                    // Turn the asterisk-type provider URLs into regex.
                    if (empty($data[1])) {
                        $matchmask = '#' . str_replace('___wildcard___', '(.+)', preg_quote(str_replace('*', '___wildcard___', $matchmask), '#')) . '#i';
                        $matchmask = preg_replace('|^#http\\\\://|', '#https?\\://', $matchmask);
                    }

                    if (preg_match($matchmask, $link_raw)) {
                        $embeds[] = $link_all_but_proto;
                        // @todo Check unique before adding
                        // @todo Try to get ID's for the ones we care about (shortcode_keepers)
                        break;
                    }
                }
            }

            if (!empty($embeds)) {
                if (!isset($extracted['has'])) {
                    $extracted['has'] = array();
                }
                $extracted['has']['embed'] = count($embeds);
                $extracted['embed'] = array('url' => $embeds);
            }
        }
    }

    return $extracted;
}