File Manager

Viewing File: /home/ubuntu/code_review/arcanist/src/parser/html/PhutilHTMLParser.php

<?php

final class PhutilHTMLParser extends Phobject {

  private $cursor;

  public function parseDocument($corpus) {
    // Divide the block into sequences of "tag" and "non-tag" content. Tag
    // content is anything between angle brackets ("<" and ">"). Non-tag
    // content is anything else.

    $segment_pos = 0;
    $segments = array();
    $in_tag = false;

    for ($ii = 0; $ii < strlen($corpus); $ii++) {
      $c = $corpus[$ii];

      if ($in_tag && ($c === '>')) {
        if ($segment_pos !== null) {
          $segments[] = array(
            'tag' => $in_tag,
            'pos' => $segment_pos,
            'end' => $ii + 1,
          );
        }

        $segment_pos = $ii + 1;
        $in_tag = false;
        continue;
      }

      // When we encounter a "<", we start a new tag whether we're already in
      // a tag or not. We want to parse "<x>1 < 2</x>" as a single tag with
      // the content "1 < 2".

      if ($c === '<') {
        $segments[] = array(
          'tag' => false,
          'pos' => $segment_pos,
          'end' => $ii,
        );

        $segment_pos = $ii;
        $in_tag = true;
        continue;
      }
    }

    // Add whatever content was left at the end of the string. If we were in
    // a tag but did not find a closing ">", we treat this as normal content.
    $segments[] = array(
      'tag' => false,
      'pos' => $segment_pos,
      'end' => $ii,
    );

    // Slice the marked segments out of the raw corpus so we get a list of
    // "tag" strings and a list of "non-tag" strings.

    $parts = array();
    $corpus_length = strlen($corpus);
    foreach ($segments as $segment) {
      $tag = $segment['tag'];
      $pos = $segment['pos'];
      $len = $segment['end'] - $pos;

      // If this is a tag, we'll drop the "<" at the beginning and the ">"
      // at the end here.
      if ($tag) {
        $slice_pos = $pos + 1;
        $slice_len = $len - 2;
      } else {
        $slice_pos = $pos;
        $slice_len = $len;
      }

      if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
        $content = substr($corpus, $slice_pos, $slice_len);
      } else {
        $content = '';
      }

      $parts[] = array(
        'tag' => $tag,
        'pos' => $pos,
        'len' => $len,
        'content' => $content,
      );
    }

    $root = new PhutilDOMNode();
    $this->setCursor($root);

    foreach ($parts as $part) {
      $tag = $this->newTagDOMNode($part);

      if ($tag !== null) {
        continue;
      }

      $content = $part['content'];

      // If this part is a tag, restore the angle brackets.
      if ($part['tag']) {
        $content = '<'.$content.'>';
      }

      $node = id(new PhutilDOMNode())
        ->setContent($content)
        ->setRawHead($content);

      $this->getCursor()->appendChild($node);
    }

    $root->mergeContent();

    return $root;
  }

  private function newTagDOMNode(array $part) {
    if (!$part['tag']) {
      return null;
    }

    $raw_content = $part['content'];
    $content = $raw_content;

    $content = trim($content);
    $content_len = strlen($content);

    // If the tag content begins with "/", like "</td>", strip the slash
    // off and mark this as a closing tag.
    $is_close = false;
    if ($content_len > 0 && $content[0] === '/') {
      $is_close = true;
      $content = substr($content, 1);
      $content = trim($content);
      $content_len = strlen($content);
    }

    // If the tag content ends with "/", like "<td />", strip the slash off
    // and mark this as self-closing.
    $self_close = false;
    if ($content_len > 0 && $content[$content_len - 1] === '/') {
      $self_close = true;
      $content = substr($content, 0, $content_len - 1);
      $content = trim($content);
      $content_len = strlen($content);
    }

    // If this tag is both a closing tag and a self-closing tag, it is
    // not formatted correctly. Treat it as content.
    if ($self_close && $is_close) {
      return null;
    }

    // Now, split the rest of the tag into the tag name and tag attributes.
    $pieces = preg_split('/\s+/', $content, 2);
    $tag_name = $pieces[0];

    if (count($pieces) > 1) {
      $attributes = $pieces[1];
    } else {
      $attributes = '';
    }

    // If there's no tag name, this tag is not valid. Treat it as content.
    if (!strlen($tag_name)) {
      return null;
    }

    // If this is a closing tag with attributes, it's not valid. Treat it
    // as content.
    if ($is_close && strlen($attributes)) {
      return null;
    }

    $tag_name = phutil_utf8_strtolower($tag_name);

    // If we find a valid closing tag, try to find a matching tag on the stack.
    // If we find a matching tag, close it.
    // If we do not find a matching tag, treat the closing tag as content.
    if ($is_close) {
      $cursor = $this->getCursor();

      while ($cursor) {
        if ($cursor->getTagName() === $tag_name) {
          // Add this raw content to the raw content of the tag we're closing.
          $cursor->setRawTail('<'.$raw_content.'>');

          $parent = $cursor->getParentNode();
          $this->setCursor($parent);

          return true;
        }
        $cursor = $cursor->getParentNode();
      }

      return null;
    }

    if (strlen($attributes)) {
      $attribute_map = $this->parseAttributes($attributes);
      // If the attributes can't be parsed, treat the tag as content.
      if ($attribute_map === null) {
        return null;
      }
    } else {
      $attribute_map = array();
    }

    $node = id(new PhutilDOMNode())
      ->setTagName($tag_name)
      ->setAttributes($attribute_map)
      ->setRawHead('<'.$raw_content.'>');

    $cursor = $this->getCursor();
    $cursor->appendChild($node);

    if (!$self_close) {
      $this->setCursor($node);
    }

    return $node;
  }

  private function setCursor(PhutilDOMNode $cursor) {
    $this->cursor = $cursor;
    return $this;
  }

  private function getCursor() {
    return $this->cursor;
  }

  private function parseAttributes($attributes) {
    $state = 'key';

    $whitespace = array(
      ' ' => true,
      "\n" => true,
      "\t" => true,
      "\r" => true,
    );

    $map = array();
    $len = strlen($attributes);
    $key_pos = null;
    for ($ii = 0; $ii < $len; $ii++) {
      $c = $attributes[$ii];
      $is_space = isset($whitespace[$c]);

      switch ($state) {
        case 'key':
          // We're looking for the start of an attribute name.

          // Skip over any whitespace.
          if ($is_space) {
            break;
          }

          // If we see "<tag =...", that isn't valid. Treat this tag as
          // content.
          if ($c === '=') {
            return null;
          }

          // If we see a quotation mark with no attribute name, that isn't
          // valid. Treat this tag as content.
          if ($c === '"') {
            return null;
          }

          // Any other character marks the beginning of an attribute name.
          // Switch the parser state to "name" to parse the name.
          $name_pos = $ii;
          $state = 'name';
          break;
        case 'name':
          // We're looking for the end of an attribute name.

          // Finding a "=" or a space character ends the attribute name.
          // Save it, then figure out what to do with the parser state.
          if ($c === '=' || $is_space) {
            $name_value = substr($attributes, $name_pos, $ii - $name_pos);
            $name_value = phutil_utf8_strtolower($name_value);

            // If this attribute already exists, the tag is invalid. This means
            // the input is something like "<tag a=1 a=2>".
            if (isset($map[$name_value])) {
              return null;
            }
          }

          // If we find an "=", that's the end of the name. Next, we're going
          // to parse a value.
          if ($c === '=') {
            $state = 'value';
            break;
          }

          // If we find whitespace, that's the end of the name. We're going
          // to look for an "=".
          if ($is_space) {
            $state = 'equals';
            break;
          }

          break;
        case 'equals':
          // We've parsed the name of an attribute and are looking for an
          // "=" character.

          // Skip over any whitespace.
          if ($is_space) {
            break;
          }

          // This is the "=" we're looking for, so we're good to go.
          if ($c === '=') {
            $state = 'value';
            break;
          }

          // If this is anything else, this is an attribute name with no
          // value. Treat it as "true" and move on. This corresponds to an
          // input like "<input disabled>".
          $map[$name_value] = true;
          $name_pos = $ii;
          $state = 'name';
          break;
        case 'value':
          // We've parsed an "=" and are looking for the start of a value.

          // Skip over any whitespace.
          if ($is_space) {
            break;
          }

          // Don't accept "<tag a==" to mean that key "a" has a value of
          // "=", since this is silly. To specify a value beginning with "=",
          // you have to quote it.
          if ($c === '=') {
            return null;
          }

          // Anything else is a value.
          $value_pos = $ii;

          // This is a quotation mark, so parse a quoted value.
          if ($c === '"') {
            $value_pos = $value_pos + 1;
            $state = 'quoted';
          } else {
            $state = 'unquoted';
          }
          break;
        case 'quoted':
          // We've started parsing a quoted value, so look for the closing
          // quote.

          // We found the closing quote, so pull out the actual value.
          if ($c === '"') {
            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);

            $map[$name_value] = $attr_value;
            $state = 'key';
            break;
          }

          // Anything else is more text in the quoted value.
          break;
        case 'unquoted':
          // We've started parsing an unquoted value, so look for terminating
          // whitespace.

          // We've found some whitespace, so pull out the actual value.
          if ($is_space) {
            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);

            $map[$name_value] = $attr_value;
            $state = 'key';
            break;
          }

          // Anything else is more text in the unquoted value.
          break;
      }
    }

    switch ($state) {
      case 'key':
        // We were looking for the start of an attribute name, so there's
        // nothing to clean up.
        break;
      case 'name':
        // We were looking for the end of an attribute name. Treat whatever
        // we found as a name.
        $name_value = substr($attributes, $name_pos, $len - $name_pos);

        if (isset($map[$name_value])) {
          return null;
        }

        $map[$name_value] = true;
        break;
      case 'equals':
      case 'value':
        // We found an attribute name followed by whitespace or an "=". Treat
        // whatever we found as a valid attribute name with no value.

        if (isset($map[$name_value])) {
          return null;
        }

        $map[$name_value] = true;
        break;
      case 'quoted':
      case 'unquoted':
        // We were parsing a value but ran out of characters before we found
        // the delimiter or closing quote. Treat whatever we found as a quoted
        // value.

        $attr_value = substr($attributes, $value_pos, $len - $name_pos);

        $map[$name_value] = $attr_value;
        break;
    }

    return $map;
  }

}
Back to Directory File Manager