Suche Backendcoder
11.04.2015 um 00:15
Naja. Kann gerne mal ein Beispiel posten. Die aktuelle Postparserklasse.
Extrahiert Links, Bilder, Videos, Hashtags und Useradds (Useradds bisher, ohne sie in die DB einzutragen) und erzeugt einen BBCode für Oneboxing, sofern die Links in der Whitelist sind. Der gegebene Text wird rückwärts geparst.
Die Links werden später bei der Ausgabe natürlich nochmal kontrolliert. Keine Data-uris und andere XSS Dinger.
Wer das Skript nutzen möchte... Bitte sehr. Aber ohne Garantie. Ich müsste es nochmal überfliegen und werde das auch tun, wenn die Useraddfunktion endlich umgesetzt wird.
PS: nonono === 'secret' und die neuen webm Videos müssen auch noch eingefügt werden.
<?php
/**
* @category PHP
* @package nonono
* @author nonono
* @copyright (c) 2013 nonono
* @license MIT
* @version 1.01
* @link http://nonono.de/en/
* @see ''
*
* subjecttypeids { 1:topics & posts, 2:channels }
*
*
*/
namespace nonono\version1;
class linkparser {
// Upper bound for the number of scan-loop iterations of one extractAndReplace() call.
const _MAX_ROUNDS_ = 20;
// The onebox counter is compared against this value with "<=" before a whitelisted link is oneboxed.
const _MAX_OGP_LINKS_ = 10;
const _CACHE_TIME_ = 2419200; // we cache the fetched link data / opengraphdata etc. for some time -> value in seconds (28 days)
const _MAX_NUM_BANGS_ = 9; // compared with "<=" against count() BEFORE the new bang is added, so the effective maximum is this value + 1
const _MIN_LENGTH_BANG_ = 3; // minimum length of a bang without its leading # / @
const _MAX_LENGTH_BANG_ = 30; // maximum length of a bang without its leading # / @
// database handle handed to the constructor (used like a PDO instance by addTagsToDB() / addLinksToDB())
private $_db;
// redis client handed to the constructor
private $_redis;
// the text that is scanned and rewritten in place
private $_text;
private $_channelId;
private $_subjectId;
private $_subjectTypeId;
// 't' or 'c' once setSubjectPrefix() was called, false until then
private $_subjectPrefix = false;
// if true the cached tags of the subject are deleted from redis after tagging
private $_cleancache;
// number of links that were marked for oneboxing so far
private $_countOGP = 0;
// if false addExtract() does not collect anything for the later bulk insertion
private $_writeToDB = false;
// this is our root target id
// if we find imagelink inside a post
// we add them to the topicid which would be in this case the root id
private $_rootId;
private $_embedkey = 'nonono';
// a link / bang directly preceded by one of these characters is skipped
private $_skipper = array( '#', '@', ']' );
// trailing characters that are cut off the end of a link / bang
private $_punctuation = array( ',', '.', '?', '!', ':', ' ' );
// characters that are allowed directly in front of a link / bang
private $_validPrefixes = array( ',','.','?','!',':','-','_',"\r",'~',' ' );
// collected results; the key for onebox links is 'ogp', which is the key scanLinks() writes to
private $_extractions = array( 'ogp' => array(), 'bangs' => array(), 'useradds' => array() );
// stack of array( needle, type ) scan passes, filled by prepareText()
private $_settings = array();
private $_features;
// false until addTagsToDB() stored at least one new tag association, an array afterwards
private $_jsonifiedTags = false;
// every entry is array( opening tag, closing tag )
private $_bbcodes = array(
'ogp' => array( '[ogp]', '[/ogp]' ), // valid linksources for oneboxing ( loading thumbnail,title,text using opengraph )
// NOTE(review): the closing tag is an empty string here, unlike 'ogp' - possibly '[/url]' got lost, confirm before changing
'url' => array( '[url]', '' ), // links we don't onebox for security reasons -> links to sites not included in our valid links list
// NOTE(review): the opening tag looks like a rendered placeholder instead of an image tag - confirm the intended value.
// The empty closing tag was added so that index 1 always exists when the replacement is built.
'img' => array( '[objerror]Bild nicht mehr verfügbar[/objerror]', '' ), // normal imagelinks
);
// file extensions that are handled as image links
private $_images = array( 'jpg', 'jpeg', 'gif', 'png' );
// link => array( type, meta, domain ), filled by addExtract()
private $_extractedLinks = array();
/**
 * Stores the storage handles the parser works with.
 *
 * @param object $rdb        redis client
 * @param mixed  $mysql      database handle, false if nothing is written to the database
 * @param bool   $cleancache true to delete the cached tags of the subject in redis after tagging
 */
public function __construct( &$rdb, &$mysql = false, $cleancache = false ){
    $this->_cleancache = $cleancache;
    $this->_db         = $mysql;
    $this->_redis      = $rdb;
}
/**
 * Registers the scan passes for the enabled features and normalizes the text.
 *
 * For every enabled feature ( 'links', 'hashtags', 'useradds' ) one
 * array( needle, type ) entry is put on the settings stack and the bbcodes
 * belonging to that feature are removed from the text, so tags typed by the
 * user ( or left over from an earlier run ) never survive.
 *
 * @param array $features feature switches, missing keys count as disabled
 * @return $this
 */
public function prepareText( &$features ){
    $this->_features = $features;
    // start with an empty stack, otherwise a second call would register every pass twice
    $this->_settings = array();
    $removeBBcodes = array();
    if( ! empty( $this->_features['links'] ) ){
        $this->_settings[] = array( '://', 1 );
        $removeBBcodes[] = '[ogp]';
        $removeBBcodes[] = '[/ogp]';
        $removeBBcodes[] = '[url]';
        // the original entry was an empty string, which str_replace() silently skips
        $removeBBcodes[] = '[/url]';
    }
    if( ! empty( $this->_features['hashtags'] ) ){
        $this->_settings[] = array( '#', 2 );
        $removeBBcodes[] = '[bang]';
        $removeBBcodes[] = '[/bang]';
    }
    if( ! empty( $this->_features['useradds'] ) ){
        $this->_settings[] = array( '@', 3 );
        $removeBBcodes[] = '[user]';
        $removeBBcodes[] = '[/user]';
    }
    $this->_text = (string) $this->_text;
    // remove duplicate whitespaces
    // this while loop is way faster than preg_replace
    // the needle has to be a DOUBLE space: searching for a single space and
    // replacing it with a single space never terminates
    while( strpos( $this->_text, '  ' ) !== false ){
        $this->_text = str_replace( '  ', ' ', $this->_text );
    }
    $this->_text = str_replace( $removeBBcodes, '', $this->_text );
    // many people use line breaks as delimiter for images
    // to get these links without problem
    // add a whitespace in front of any linebreak
    // this won't show up in the end but helps us to get everything we need
    $this->_text = str_replace( array( "\r\n", "\r", "\n" ), " \r", $this->_text );
    return $this;
}
/**
 * Runs every registered scan pass over the text and replaces the matches with bbcode.
 *
 * The passes are taken from the settings stack one after another. Each pass
 * searches its needle from the end of the text towards the beginning. The
 * round counter is shared by all passes, so at most _MAX_ROUNDS_ search
 * rounds are executed per call.
 *
 * @return array 'txt' => the processed text, 'bb' => 1 if the text contains a '[' afterwards, 0 otherwise
 */
public function extractAndReplace(){
    $settings = array_pop( $this->_settings );
    $rounds = 0;
    $offset = -1;
    // without a registered pass there is nothing to scan
    while( null !== $settings ){
        $add = 0;
        // we use strripos() and search from the end for some reasons
        // this avoids some unwanted bugs like nested links http://link.de https://link.de
        // the offset is clamped to the current text length, the text grows while we replace
        $offset = min( max( $offset, 1 ), strlen( $this->_text ) ) * -1;
        // check if there is one || a next match
        $start_pos = strripos( $this->_text, $settings[0], $offset );
        // if there are no matches we move on to the next pass
        // if the total number of rounds is reached we exit
        $limit_reached = $rounds >= self::_MAX_ROUNDS_;
        if( false === $start_pos || $limit_reached ){
            $settings = array_pop( $this->_settings );
            if( $limit_reached || null === $settings ){
                break;
            }
            // restart at the end of the text for the next pass,
            // clamped so that an empty text never gets an out of range offset
            $offset = min( 1, strlen( $this->_text ) ) * -1;
            $start_pos = strripos( $this->_text, $settings[0], $offset );
        }
        if( false !== $start_pos ){
            // let's see if there is something after the prefix
            // we skip this run if nothing is there like "hi, i am @ localhost"
            $off = $start_pos + strlen( $settings[0] );
            if( $off < strlen( $this->_text ) && $this->_text[ $off ] !== ' ' ){
                switch ( $settings[1] ) {
                    case 1: // this is a link, extractLink() moves $start_pos to the beginning of the scheme
                        $add = $this->extractLink( $start_pos );
                        break;
                    case 2: // this is a hashbang
                        $add = $this->extractBangs( $start_pos, $settings[0], array( '[bang]', '[/bang]' ), 'bangs' );
                        break;
                    case 3: // this is a useradd
                        $add = $this->extractBangs( $start_pos, $settings[0], array( '[user]', '[/user]' ), 'useradds' );
                        break;
                    default:
                        break;
                }
            }
            $start_pos = $start_pos - $add;
        }
        else {
            $start_pos = -1;
        }
        // the next round only looks at the part of the text in front of the current match
        $offset = strlen( $this->_text ) - $start_pos + 1;
        $rounds++;
    }
    return array( 'txt' => $this->_text, 'bb' => strpos( $this->_text, '[' ) === false ? 0 : 1 );
}
/**
 * Validates the bang ( hashtag / useradd ) starting at $start_pos and hands it to scanBang().
 *
 * A bang is only accepted at the very beginning of the text or directly
 * behind one of the valid prefix characters. The original check treated the
 * positions 0 and 1 alike and therefore never looked at the character in
 * front of a bang at index 1 ( "a#tag" was accepted ).
 *
 * @param int    $start_pos position of the # / @ inside the text
 * @param string $prefix    the needle that was found, not used here
 * @param array  $bbcode    array( opening tag, closing tag )
 * @param string $arr_pos   key of the extraction list the bang is added to
 * @return int the return value of scanBang(), 0 if the bang is skipped
 */
private function extractBangs( &$start_pos, &$prefix, $bbcode, $arr_pos ){
    if( $start_pos > 0 ){
        $before = $this->_text[ $start_pos - 1 ];
        // we skip if the bang is prefixed with one of our skipper characters
        // this prevents that link anchors are handled as bangs etc.
        if( in_array( $before, $this->_skipper, true ) || ! in_array( $before, $this->_validPrefixes, true ) ){
            return 0;
        }
    }
    // get the end position of this bang
    $end_pos = $this->getEndPos( $start_pos );
    // doesn't matter how many punctuation chars the user places after the bang
    // we loop them away until there is no more punctuation
    while( $end_pos > $start_pos && in_array( $this->_text[ $end_pos ], $this->_punctuation, true ) ){
        $end_pos--;
    }
    // extract the full bang
    $bang_length = $end_pos - $start_pos + 1;
    $bang_full = substr( $this->_text, $start_pos, $bang_length );
    return $this->scanBang( $bang_full, $start_pos, $bang_length, $bbcode, $arr_pos );
}
/**
 * Stores an accepted bang and wraps it into its bbcode inside the text.
 *
 * The bang is accepted if the target list is not full yet and the cleaned
 * name ( tags stripped, leading # / @ removed ) has a valid length.
 *
 * @param string $bang_full   the bang as it is written in the text
 * @param int    $start_pos   position of the bang inside the text
 * @param int    $bang_length length of the bang inside the text
 * @param array  $bbcode      array( opening tag, closing tag )
 * @param string $arr_pos     key of the extraction list
 * @return int always 1
 */
private function scanBang( $bang_full, $start_pos, $bang_length, $bbcode, $arr_pos ){
    $name = ltrim( strip_tags( $bang_full ), '#@' );
    $name_length = strlen( $name );

    $list_has_room = count( $this->_extractions[ $arr_pos ] ) <= self::_MAX_NUM_BANGS_;
    $length_is_ok = $name_length >= self::_MIN_LENGTH_BANG_ && $name_length <= self::_MAX_LENGTH_BANG_;

    if( $list_has_room && $length_is_ok ){
        // remember the cleaned name ...
        $this->_extractions[ $arr_pos ][] = $name;
        // ... and put the bbcoded version ( with a leading whitespace ) into the text
        $replacement = ' ' . $bbcode[0] . $bang_full . $bbcode[1];
        $this->_text = substr_replace( $this->_text, $replacement, $start_pos, $bang_length );
    }
    return 1;
}
/**
 * Returns the position of the first whitespace at or behind $start_pos.
 * If there is none the subject ends with the text and the index of the
 * last character is returned.
 *
 * @param int $start_pos position the search starts at
 * @return int
 */
private function getEndPos( $start_pos ){
    $space_at = strpos( $this->_text, ' ', $start_pos );
    // we have a 0 index and need to substract 1 from the length to get the last position
    return false === $space_at ? strlen( $this->_text ) - 1 : $space_at;
}
/**
 * Validates the link whose '://' was found at $start_pos and hands it to scanLinks().
 *
 * Only http:// and https:// are accepted. The complete scheme is compared
 * now; the original compared 'ttps:' only, so it never verified the leading
 * 'h' and could move the start position in front of the text. A link is
 * accepted at the very beginning of the text or directly behind one of the
 * valid prefix characters, the character in front of a link at index 1 is
 * no longer ignored.
 *
 * @param int $start_pos position of '://', moved to the beginning of the scheme if one is found
 * @return int the return value of scanLinks(), 0 if the link is skipped
 */
private function extractLink( &$start_pos ){
    // we work with :// to get links
    // so we have to find the beginning of the scheme in front of it
    $scheme_start = false;
    if( $start_pos >= 5 && strtolower( substr( $this->_text, $start_pos - 5, 5 ) ) === 'https' ){
        $scheme_start = $start_pos - 5;
    }
    elseif( $start_pos >= 4 && strtolower( substr( $this->_text, $start_pos - 4, 4 ) ) === 'http' ){
        $scheme_start = $start_pos - 4;
    }
    if( false === $scheme_start ){
        return 0;
    }
    $start_pos = $scheme_start;
    if( $start_pos > 0 ){
        $before = $this->_text[ $start_pos - 1 ];
        // we skip if the link is prefixed with one of our skipper characters
        // this prevents that hashtags like #http:// are handled as links
        if( in_array( $before, $this->_skipper, true ) || ! in_array( $before, $this->_validPrefixes, true ) ){
            return 0;
        }
    }
    $end_pos = $this->getEndPos( $start_pos );
    // doesn't matter how many punctuation chars the user places after the link
    // we loop them away until there is no more punctuation
    while( $end_pos > $start_pos && in_array( $this->_text[ $end_pos ], $this->_punctuation, true ) ){
        $end_pos--;
    }
    // extract the full link
    $link_length = $end_pos - $start_pos + 1;
    $link_path = substr( $this->_text, $start_pos, $link_length );
    return $this->scanLinks( $link_path, $start_pos, $link_length );
}
/**
 * Classifies a link ( image, oneboxed link, plain link ) and replaces it with its bbcode.
 *
 * Changes compared to the first version:
 * - the host is checked before it is used, unparsable links are skipped
 * - the rebuilt image link has no doubled slash anymore and keeps its port
 * - a failed thumbnail creation is handled before the result is accessed
 * - a redis failure during the whitelist lookup degrades to "not whitelisted",
 *   like the other redis helpers of this class do
 *
 * @param string $link_path   the link as it is written in the text
 * @param int    $start_pos   position of the link inside the text
 * @param int    $link_length length of the link inside the text
 * @return int 1 if the link was replaced, 0 if it was skipped
 */
private function scanLinks( $link_path, $start_pos, $link_length ){
    if( empty( $link_path ) ){
        return 0;
    }
    $uri = parse_url( $link_path );
    // parse_url() returns false for seriously malformed urls
    if( ! is_array( $uri ) || empty( $uri['host'] ) ){
        return 0;
    }
    // extract the domain name
    $domain_parts = $this->get_subdomains( $uri['host'] );
    $domain = strtolower( $domain_parts['domain'] );
    $path = empty( $uri['path'] ) ? '' : $uri['path'];
    // now we want the extension of the file: everything behind the last dot of the path
    // end() needs a real variable, it takes its argument by reference
    $path_parts = explode( '.', $path );
    $extension = end( $path_parts );
    // and check if it points to an imagetype we support
    if( in_array( strtolower( $extension ), $this->_images, true ) ){
        // our javascript image loader doesnt load
        // images that don't end with a valid extension
        // for now we remove all params
        if( ! empty( $uri['query'] ) ){
            $port = isset( $uri['port'] ) ? ':' . $uri['port'] : '';
            // the path already starts with a slash
            $link_path = $uri['scheme'] . '://' . $uri['host'] . $port . $path;
        }
        $image_paths = $this->createThumbnail( $link_path, $extension );
        if( $image_paths ){
            $this->addExtract( $link_path, 1, $image_paths['thumbnail_path'], $domain );
            $valid = array( '[img=' . $image_paths['thumbnail_path'] . ']', '[/img]' );
            $link_path = $image_paths['full_image_path'];
        }
        else {
            // no thumbnail, so there is no meta data for this image
            $this->addExtract( $link_path, 1, null, $domain );
            $valid = $this->_bbcodes['img'];
        }
    }
    else {
        // let's see if the link is in our whitelist
        try {
            $whitelisted = $this->_redis->sIsMember( 'nonono', $domain );
        } catch (\RedisException $e) {
            $whitelisted = false;
        }
        $this->addExtract( $link_path, 2, false, $domain );
        if( $whitelisted && $this->_countOGP <= self::_MAX_OGP_LINKS_ ){
            // this domain is whitelisted, we can 'safely' ( with care.. ) use it for oneboxing
            // we just add links to the extraction array
            // if we don't have something in our cache to come up with
            if( ! $this->dataExists( $link_path ) ){
                $this->_extractions['ogp'][] = $link_path;
            }
            $valid = $this->_bbcodes['ogp'];
            $this->_countOGP++;
        }
        else {
            // not whitelisted or maximum number of oneboxers exceeded
            $valid = $this->_bbcodes['url'];
        }
    }
    $this->updateLinkCounter( $link_path );
    // a bbcode entry may come without a closing tag
    $opening = isset( $valid[0] ) ? $valid[0] : '';
    $closing = isset( $valid[1] ) ? $valid[1] : '';
    $this->_text = substr_replace(
        $this->_text,
        ' ' . $opening . $link_path . $closing,
        $start_pos,
        $link_length
    );
    return 1;
}
/**
 * Remembers a link for the bulk insertion done by addLinksToDB().
 * Nothing is collected while database insertion is disabled.
 *
 * @param string $link   the link, used as key so every link is stored once
 * @param int    $type   1 == image, 2 == normal link
 * @param mixed  $meta   additional data stored with the link
 * @param string $domain domain of the link
 * @return void
 */
private function addExtract( $link, $type, $meta, $domain ){
    if( ! $this->_writeToDB ){
        return;
    }
    $this->_extractedLinks[ $link ] = array( $type, $meta, $domain );
}
/**
 * Asks redis whether the key 'ogp:' + link exists.
 *
 * @param string $link_path the link
 * @return mixed the answer of redis, false if redis throws
 */
private function dataExists( $link_path ){
    try {
        return $this->_redis->exists( 'ogp:' . $link_path );
    } catch (\RedisException $e) {
        return false;
    }
}
/**
 * Increments the score of the link in the sorted set 'ogp:links' by one.
 *
 * @param string $link_path the link
 * @return null|false null after the increment, false if redis throws
 */
private function updateLinkCounter( $link_path ){
    try {
        $this->_redis->zIncrBy( 'ogp:links', 1, $link_path );
    } catch (\RedisException $e) {
        return false;
    }
    return null;
}
/**
 * Writes the extracted hashtags to the tables tags and tagmap.
 *
 * Every tag is lower-cased BEFORE duplicates are removed, so "Tag" and "tag"
 * are handled once. Tags that were newly associated with the subject are
 * collected ( together with their incremented redis score ) and can be read
 * with getJsonTags(). If the database throws, the open transaction is rolled
 * back before defaultJsonExit( false ) is called.
 *
 * @param bool $json not used
 * @return $this
 */
public function addTagsToDB( $json = false ){
    if( empty( $this->_extractions['bangs'] ) ){
        return $this;
    }
    $tags = array_unique( array_map( 'strtolower', $this->_extractions['bangs'] ) );
    try {
        // prepare some queries
        $addTag = $this->_db->prepare('INSERT IGNORE INTO tags VALUES ( ?, ? )');
        $getTagId = $this->_db->prepare('SELECT tag_id FROM tags WHERE tag_name = ?');
        $addToTagMap = $this->_db->prepare('INSERT IGNORE INTO tagmap VALUES ( ?, ?, ?, ?, ? )');
        $this->_db->beginTransaction();
        foreach ( $tags as $tagname ) {
            // add new tag to tag table
            $addTag->execute( array( null, $tagname ) );
            if( 1 === (int) $addTag->rowCount() ){
                // a new row: the last insert id is our tag id
                $tagID = $this->_db->lastInsertId();
            }
            else {
                // 0 affected rows: the tag already exists and we query its id
                $getTagId->execute( array( $tagname ) );
                $r = $getTagId->fetch(\PDO::FETCH_ASSOC);
                if( false === $r ){
                    // neither inserted nor found, there is no id we could map
                    continue;
                }
                $tagID = $r['tag_id'];
            }
            // last step: add the tagdata to our tagmap
            // the tagmaptable is shared betwen everything we can tag
            // depending on what we tag we just have a different subject type id
            $addToTagMap->execute( array( null, $this->_subjectTypeId, $this->_subjectId, $tagID, time() ) );
            // only tags that were not associated with this subject before are sent back
            if( 1 === (int) $addToTagMap->rowCount() ){
                try {
                    $entry = array(
                        'tag_id' => $tagID,
                        'tag_name' => htmlspecialchars( $tagname, ENT_QUOTES, 'UTF-8' ),
                        'tag_score' => ( int ) $this->_redis->zIncrBy( 'tagScores', 1, $tagID )
                    );
                    // the property starts as false and becomes an array with the first entry
                    if( ! is_array( $this->_jsonifiedTags ) ){
                        $this->_jsonifiedTags = array();
                    }
                    $this->_jsonifiedTags[] = $entry;
                } catch (\RedisException $e) { defaultJsonExit( false ); }
            }
        }
        // done
        $this->_db->commit();
    } catch (\PDOException $p) {
        // do not leave a half written transaction behind
        try {
            if( $this->_db->inTransaction() ){
                $this->_db->rollBack();
            }
        } catch (\PDOException $ignored) {
            // the connection is unusable, nothing left to undo
        }
        defaultJsonExit( false );
    }
    // cleaning our cached tags in redis for the specific subject
    if( $this->_subjectPrefix && $this->_cleancache ){
        try {
            $this->_redis->del( implode( ':', array( $this->_subjectPrefix, $this->_subjectId, 'tags' ) ) );
        } catch (\RedisException $e) { defaultJsonExit( false ); }
    }
    return $this;
}
/**
 * Writes the links collected by addExtract() to the database.
 *
 * Every link is stored in the table links. Image links ( type 1 ) are
 * mapped in image_map_posts, everything else in linkmap. If the database
 * throws, the open transaction is rolled back before
 * defaultJsonExit( false ) is called.
 *
 * @return $this
 */
public function addLinksToDB(){
    if( empty( $this->_extractedLinks ) ){
        return $this;
    }
    try {
        // prepare some queries
        $addlink = $this->_db->prepare('INSERT IGNORE INTO links VALUES ( ?, ?, ?, ?)');
        $getlinkid = $this->_db->prepare('SELECT link_id FROM links WHERE link = ?');
        // images are stored separate
        $addtoimgmap = $this->_db->prepare('INSERT IGNORE INTO image_map_posts VALUES (?,?,?,?,?,?,?)');
        $addtolinkmap = $this->_db->prepare('INSERT IGNORE INTO linkmap VALUES ( ?, ?, ?, ?, ?, ?, ? )');
        $this->_db->beginTransaction();
        // $data is array( type, meta, domain )
        foreach ( $this->_extractedLinks as $link => $data ) {
            // add new link to link table
            $addlink->execute( array( null, $link, $data[2], $data[1] ) );
            if( 1 === (int) $addlink->rowCount() ){
                // a new row: the last insert id is our link id
                $linkID = $this->_db->lastInsertId();
            }
            else {
                // 0 affected rows: the link already exists and we query its id
                $getlinkid->execute( array( $link ) );
                $r = $getlinkid->fetch(\PDO::FETCH_ASSOC);
                if( false === $r ){
                    // neither inserted nor found, there is no id we could map
                    continue;
                }
                $linkID = $r['link_id'];
            }
            // last step: map the link to the subject
            // type 1 is an image link, every other type is handled as a normal link
            if( 1 == $data[0] ){
                $addtoimgmap->execute( array( null, $this->_subjectId, $this->_rootId, $this->_channelId, _UID_, $linkID, time() ) );
            }
            else {
                $addtolinkmap->execute( array( null, $this->_subjectTypeId, $this->_subjectId, $linkID, $this->_rootId, $this->_channelId, _UID_ ) );
            }
        }
        // done
        $this->_db->commit();
    } catch (\PDOException $p) {
        // do not leave a half written transaction behind
        try {
            if( $this->_db->inTransaction() ){
                $this->_db->rollBack();
            }
        } catch (\PDOException $ignored) {
            // the connection is unusable, nothing left to undo
        }
        defaultJsonExit( false );
    }
    return $this;
}
/****
*
* download images and create thumbnails
*
****/
/**
 * Downloads the image behind the link and creates a thumbnail of it.
 *
 * @param string $link_path the image link
 * @param string $extension file extension of the image
 * @return array|false 'thumbnail_path' and 'full_image_path', both without
 *                     their first 13 characters, or false if the download
 *                     or the thumbnail failed
 */
private function createThumbnail( $link_path, $extension ){
    $thumbnail_folder = 'nonono';
    $full_image_path = 'nonono';

    // image and thumbnail share the same generated file name
    $file_name = $this->createNewFileName( $extension );
    $stored_image = $full_image_path . '/' . $file_name;
    $stored_thumbnail = $thumbnail_folder . '/' . $file_name;

    $this->folderExists( $thumbnail_folder );

    $image_raw = $this->fetch_image( $link_path, $stored_image );
    if( ! $image_raw ){
        return false;
    }
    $path_on_succes = $this->makeThumbnail( $image_raw, $stored_thumbnail, $extension );
    if( ! $path_on_succes ){
        return false;
    }
    return array(
        'thumbnail_path' => substr( $path_on_succes, 13 ),
        'full_image_path' => substr( $stored_image, 13 )
    );
}
/**
 * Creates a thumbnail ( at most 200px wide, never upscaled ) from raw image data.
 *
 * Returns false now if the extension is not supported or the file could not
 * be written; the original returned the path in both cases. The height is
 * at least 1px, very wide images would otherwise ask for an image without height.
 *
 * @param string $file      raw image data
 * @param string $path      target path of the thumbnail
 * @param string $extension jpg, jpeg, png or gif, selects the output format
 * @return string|false $path on success, false otherwise
 */
private function makeThumbnail( $file, $path, $extension ){
    // the data comes from a remote server and may be anything, invalid data only yields false here
    $img = @imagecreatefromstring( $file );
    if( false === $img ){
        return false;
    }
    $old_width = imagesx( $img );
    $old_height = imagesy( $img );
    if( $old_width < 1 || $old_height < 1 ){
        return false;
    }
    // calculate thumbnail size
    $new_width = min( 200, $old_width );
    $new_height = max( 1, (int) floor( $old_height * ( $new_width / $old_width ) ) );
    // create a new temporary image
    $tmp_img = imagecreatetruecolor( $new_width, $new_height );
    if( false === $tmp_img ){
        return false;
    }
    // copy and resize old image into new image
    imagecopyresized( $tmp_img, $img, 0, 0, 0, 0, $new_width, $new_height, $old_width, $old_height );
    switch ( strtolower( $extension ) ) {
        case 'jpg':
        case 'jpeg':
            $written = imagejpeg( $tmp_img, $path, 100 );
            break;
        case 'png':
            $written = imagepng( $tmp_img, $path, 0 );
            break;
        case 'gif':
            $written = imagegif( $tmp_img, $path );
            break;
        default:
            // a format we can't write
            $written = false;
            break;
    }
    return $written ? $path : false;
}
/**
 * Downloads an image and stores the raw data on disk.
 *
 * A failed or empty download is not written to disk anymore and a failed
 * write is reported as failure instead of being passed to fwrite() / fclose().
 *
 * NOTE(review): the url is taken from user supplied text and is requested
 * by the server, so hosts of the internal network can be reached through
 * this method - consider validating the target host.
 * NOTE(review): the certificate check is disabled and the timeout is 1000
 * seconds, both kept as they were - confirm that this is intended.
 *
 * @param string $link_path          url of the image
 * @param string $new_full_file_path path the raw data is written to
 * @return string|false the raw image data, false if the download or the write failed
 */
private function fetch_image( $link_path, $new_full_file_path ){
    $ch = curl_init( $link_path );
    if( false === $ch ){
        return false;
    }
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    // the raw data is returned as it is, CURLOPT_BINARYTRANSFER has no effect anymore and was dropped
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 1000);
    curl_setopt($ch, CURLOPT_USERAGENT, 'nonono');
    $image_raw = curl_exec( $ch );
    curl_close( $ch );
    if( false === $image_raw || '' === $image_raw ){
        return false;
    }
    if( false === file_put_contents( $new_full_file_path, $image_raw ) ){
        return false;
    }
    return $image_raw;
}
/**
 * Makes sure the folder exists, it is created ( recursively, mode 0755 ) if it is missing.
 *
 * The original used a nested ternary without parentheses, which is
 * deprecated since PHP 7.4 and does not compile on PHP 8.
 *
 * @param string $path the folder
 * @return bool true if the folder exists or was created, false otherwise
 */
private function folderExists( $path ){
    return is_dir( $path ) || mkdir( $path, 0755, true );
}
/**
 * Builds a file name of the form mindcluster_<uniqid>_<random number>.<extension>.
 *
 * @param string $extension file extension without the dot
 * @return string
 */
private function createNewFileName( $extension ){
    return uniqid( 'mindcluster_' ) . '_' . rand() . '.' . $extension;
}
/**
 * Cuts the domain ( name + tld ) from the end of a host name.
 *
 * @param string $domain the host name
 * @return string the matched domain, the unchanged input if the pattern does not match
 */
private function get_domain($domain){
    $matches = array();
    $found = preg_match("/(?P<domain>[a-z0-9][a-z0-9\-]{1,63}\.[a-z\.]{2,6})$/i", $domain, $matches);
    return $found ? $matches['domain'] : $domain;
}
/**
 * Splits a host name into its subdomain part and its domain.
 *
 * @param string $domain the host name
 * @return array 'sub' => everything in front of the domain without the trailing dot, 'domain' => the domain
 */
private function get_subdomains($domain){
    $registered = $this->get_domain( $domain );
    // everything in front of the first occurrence of the domain
    $in_front = strstr( $domain, $registered, true );
    return array(
        'sub' => rtrim( $in_front, '.' ),
        'domain' => $registered
    );
}
/**
 * Sets the text that is scanned.
 *
 * @param string $in the text
 * @return $this
 */
public function setScannerText( &$in ){
    $this->_text = $in;

    return $this;
}
/**
 * Sets the id of the subject the tags and links are mapped to.
 *
 * @param mixed $in the subject id
 * @return $this
 */
public function setSubjectId( $in ){
    $this->_subjectId = $in;

    return $this;
}
/**
 * Derives the prefix of the redis tag cache key from the subject type id.
 * This is only needed for cached tags. Any type id other than 1 or 2
 * leads to defaultJsonExit( false ).
 *
 * @return $this
 */
public function setSubjectPrefix(){
    if( $this->_subjectTypeId == 1 ){
        // topics
        $this->_subjectPrefix = 't';
    }
    elseif( $this->_subjectTypeId == 2 ){
        // channels
        $this->_subjectPrefix = 'c';
    }
    else {
        defaultJsonExit(false);
    }
    return $this;
}
/**
 * Sets the root target id that is stored with mapped links and images.
 *
 * @param mixed $in the root id
 * @return $this
 */
public function setRootId( $in ){
    $this->_rootId = $in;

    return $this;
}
/**
 * Sets the channel id that is stored with mapped links and images.
 *
 * @param mixed $in the channel id
 * @return $this
 */
public function setChannelId( $in ){
    $this->_channelId = $in;

    return $this;
}
/**
 * Enables or disables the collection of links for the database insertion.
 *
 * @param bool $in true to collect the links
 * @return $this
 */
public function setToDB( $in ){
    $this->_writeToDB = $in;

    return $this;
}
/**
 * Sets the type of the subject ( setSubjectPrefix() knows 1 and 2 ).
 *
 * @param mixed $in the subject type id
 * @return $this
 */
public function setSubjectTypeId( $in ){
    $this->_subjectTypeId = $in;

    return $this;
}
/**
 * Returns the tags that were newly associated by addTagsToDB().
 *
 * @return array|false list of tag entries, false if nothing was collected
 */
public function getJsonTags(){
    $collected = $this->_jsonifiedTags;

    return $collected;
}
}
?>