uast.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. package hercules
import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	goioutil "io/ioutil"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/gogo/protobuf/proto"
	"github.com/jeffail/tunny"
	"gopkg.in/bblfsh/client-go.v2"
	"gopkg.in/bblfsh/sdk.v1/protocol"
	"gopkg.in/bblfsh/sdk.v1/uast"
	"gopkg.in/src-d/enry.v1"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/ioutil"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
	"gopkg.in/src-d/hercules.v3/pb"
)
  27. type UASTExtractor struct {
  28. Endpoint string
  29. Context func() (context.Context, context.CancelFunc)
  30. PoolSize int
  31. Languages map[string]bool
  32. FailOnErrors bool
  33. ProcessedFiles map[string]int
  34. clients []*bblfsh.Client
  35. pool *tunny.WorkPool
  36. }
  37. const (
  38. UAST_EXTRACTION_SKIPPED = -(1 << 31)
  39. ConfigUASTEndpoint = "ConfigUASTEndpoint"
  40. ConfigUASTTimeout = "ConfigUASTTimeout"
  41. ConfigUASTPoolSize = "ConfigUASTPoolSize"
  42. ConfigUASTFailOnErrors = "ConfigUASTFailOnErrors"
  43. ConfigUASTLanguages = "ConfigUASTLanguages"
  44. )
  45. type uastTask struct {
  46. Client *bblfsh.Client
  47. Lock *sync.RWMutex
  48. Dest map[plumbing.Hash]*uast.Node
  49. File *object.File
  50. Errors *[]error
  51. Status chan int
  52. }
  53. type worker struct {
  54. Client *bblfsh.Client
  55. Job func(interface{}) interface{}
  56. }
  57. func (w worker) TunnyReady() bool {
  58. return true
  59. }
  60. func (w worker) TunnyJob(data interface{}) interface{} {
  61. task := data.(uastTask)
  62. task.Client = w.Client
  63. return w.Job(task)
  64. }
  65. func (exr *UASTExtractor) Name() string {
  66. return "UAST"
  67. }
  68. func (exr *UASTExtractor) Provides() []string {
  69. arr := [...]string{"uasts"}
  70. return arr[:]
  71. }
  72. func (exr *UASTExtractor) Requires() []string {
  73. arr := [...]string{"changes", "blob_cache"}
  74. return arr[:]
  75. }
  76. func (exr *UASTExtractor) Features() []string {
  77. arr := [...]string{"uast"}
  78. return arr[:]
  79. }
  80. func (exr *UASTExtractor) ListConfigurationOptions() []ConfigurationOption {
  81. options := [...]ConfigurationOption{{
  82. Name: ConfigUASTEndpoint,
  83. Description: "How many days there are in a single band.",
  84. Flag: "bblfsh",
  85. Type: StringConfigurationOption,
  86. Default: "0.0.0.0:9432"}, {
  87. Name: ConfigUASTTimeout,
  88. Description: "Babelfish's server timeout in seconds.",
  89. Flag: "bblfsh-timeout",
  90. Type: IntConfigurationOption,
  91. Default: 20}, {
  92. Name: ConfigUASTPoolSize,
  93. Description: "Number of goroutines to extract UASTs.",
  94. Flag: "bblfsh-pool-size",
  95. Type: IntConfigurationOption,
  96. Default: runtime.NumCPU()}, {
  97. Name: ConfigUASTFailOnErrors,
  98. Description: "Panic if there is a UAST extraction error.",
  99. Flag: "bblfsh-fail-on-error",
  100. Type: BoolConfigurationOption,
  101. Default: false}, {
  102. Name: ConfigUASTLanguages,
  103. Description: "Programming languages from which to extract UASTs. Separated by comma \",\".",
  104. Flag: "languages",
  105. Type: StringConfigurationOption,
  106. Default: "Python,Java"},
  107. }
  108. return options[:]
  109. }
  110. func (exr *UASTExtractor) Configure(facts map[string]interface{}) {
  111. if val, exists := facts[ConfigUASTEndpoint].(string); exists {
  112. exr.Endpoint = val
  113. }
  114. if val, exists := facts[ConfigUASTTimeout].(int); exists {
  115. exr.Context = func() (context.Context, context.CancelFunc) {
  116. return context.WithTimeout(context.Background(),
  117. time.Duration(val)*time.Second)
  118. }
  119. }
  120. if val, exists := facts[ConfigUASTPoolSize].(int); exists {
  121. exr.PoolSize = val
  122. }
  123. if val, exists := facts[ConfigUASTLanguages].(string); exists {
  124. exr.Languages = map[string]bool{}
  125. for _, lang := range strings.Split(val, ",") {
  126. exr.Languages[strings.TrimSpace(lang)] = true
  127. }
  128. }
  129. if val, exists := facts[ConfigUASTFailOnErrors].(bool); exists {
  130. exr.FailOnErrors = val
  131. }
  132. }
  133. func (exr *UASTExtractor) Initialize(repository *git.Repository) {
  134. if exr.Context == nil {
  135. exr.Context = func() (context.Context, context.CancelFunc) {
  136. return context.Background(), nil
  137. }
  138. }
  139. poolSize := exr.PoolSize
  140. if poolSize == 0 {
  141. poolSize = runtime.NumCPU()
  142. }
  143. var err error
  144. exr.clients = make([]*bblfsh.Client, poolSize)
  145. for i := 0; i < poolSize; i++ {
  146. client, err := bblfsh.NewClient(exr.Endpoint)
  147. if err != nil {
  148. panic(err)
  149. }
  150. exr.clients[i] = client
  151. }
  152. if exr.pool != nil {
  153. exr.pool.Close()
  154. }
  155. workers := make([]tunny.TunnyWorker, poolSize)
  156. for i := 0; i < poolSize; i++ {
  157. workers[i] = worker{Client: exr.clients[i], Job: exr.extractTask}
  158. }
  159. exr.pool, err = tunny.CreateCustomPool(workers).Open()
  160. if err != nil {
  161. panic(err)
  162. }
  163. exr.ProcessedFiles = map[string]int{}
  164. if exr.Languages == nil {
  165. exr.Languages = map[string]bool{}
  166. }
  167. }
  168. func (exr *UASTExtractor) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  169. cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
  170. treeDiffs := deps["changes"].(object.Changes)
  171. uasts := map[plumbing.Hash]*uast.Node{}
  172. lock := sync.RWMutex{}
  173. errs := make([]error, 0)
  174. status := make(chan int)
  175. pending := 0
  176. submit := func(change *object.Change) {
  177. {
  178. reader, err := cache[change.To.TreeEntry.Hash].Reader()
  179. if err != nil {
  180. errs = append(errs, err)
  181. return
  182. }
  183. defer ioutil.CheckClose(reader, &err)
  184. buf := new(bytes.Buffer)
  185. if _, err := buf.ReadFrom(reader); err != nil {
  186. errs = append(errs, err)
  187. return
  188. }
  189. lang := enry.GetLanguage(change.To.Name, buf.Bytes())
  190. if _, exists := exr.Languages[lang]; !exists {
  191. exr.ProcessedFiles[change.To.Name] = UAST_EXTRACTION_SKIPPED
  192. return
  193. }
  194. exr.ProcessedFiles[change.To.Name]++
  195. }
  196. pending++
  197. exr.pool.SendWorkAsync(uastTask{
  198. Lock: &lock,
  199. Dest: uasts,
  200. File: &object.File{Name: change.To.Name, Blob: *cache[change.To.TreeEntry.Hash]},
  201. Errors: &errs, Status: status}, nil)
  202. }
  203. for _, change := range treeDiffs {
  204. action, err := change.Action()
  205. if err != nil {
  206. return nil, err
  207. }
  208. switch action {
  209. case merkletrie.Insert:
  210. submit(change)
  211. case merkletrie.Delete:
  212. continue
  213. case merkletrie.Modify:
  214. submit(change)
  215. }
  216. }
  217. for i := 0; i < pending; i++ {
  218. _ = <-status
  219. }
  220. if len(errs) > 0 {
  221. msgs := make([]string, len(errs))
  222. for i, err := range errs {
  223. msgs[i] = err.Error()
  224. }
  225. joined := strings.Join(msgs, "\n")
  226. if exr.FailOnErrors {
  227. return nil, errors.New(joined)
  228. } else {
  229. fmt.Fprintln(os.Stderr, joined)
  230. }
  231. }
  232. return map[string]interface{}{"uasts": uasts}, nil
  233. }
  234. func (exr *UASTExtractor) extractUAST(
  235. client *bblfsh.Client, file *object.File) (*uast.Node, error) {
  236. request := client.NewParseRequest()
  237. contents, err := file.Contents()
  238. if err != nil {
  239. return nil, err
  240. }
  241. request.Content(contents)
  242. request.Filename(file.Name)
  243. ctx, cancel := exr.Context()
  244. if cancel != nil {
  245. defer cancel()
  246. }
  247. response, err := request.DoWithContext(ctx)
  248. if err != nil {
  249. if strings.Contains("missing driver", err.Error()) {
  250. return nil, nil
  251. }
  252. return nil, err
  253. }
  254. if response.Status != protocol.Ok {
  255. return nil, errors.New(strings.Join(response.Errors, "\n"))
  256. }
  257. if err != nil {
  258. return nil, err
  259. }
  260. return response.UAST, nil
  261. }
  262. func (exr *UASTExtractor) extractTask(data interface{}) interface{} {
  263. task := data.(uastTask)
  264. defer func() { task.Status <- 0 }()
  265. node, err := exr.extractUAST(task.Client, task.File)
  266. task.Lock.Lock()
  267. defer task.Lock.Unlock()
  268. if err != nil {
  269. *task.Errors = append(*task.Errors, errors.New(task.File.Name+": "+err.Error()))
  270. return nil
  271. }
  272. if node != nil {
  273. task.Dest[task.File.Hash] = node
  274. }
  275. return nil
  276. }
  277. type UASTChange struct {
  278. Before *uast.Node
  279. After *uast.Node
  280. Change *object.Change
  281. }
  282. type UASTChanges struct {
  283. cache map[plumbing.Hash]*uast.Node
  284. }
  285. func (uc *UASTChanges) Name() string {
  286. return "UASTChanges"
  287. }
  288. func (uc *UASTChanges) Provides() []string {
  289. arr := [...]string{"changed_uasts"}
  290. return arr[:]
  291. }
  292. func (uc *UASTChanges) Requires() []string {
  293. arr := [...]string{"uasts", "changes"}
  294. return arr[:]
  295. }
  296. func (uc *UASTChanges) Features() []string {
  297. arr := [...]string{"uast"}
  298. return arr[:]
  299. }
  300. func (uc *UASTChanges) ListConfigurationOptions() []ConfigurationOption {
  301. return []ConfigurationOption{}
  302. }
  303. func (uc *UASTChanges) Configure(facts map[string]interface{}) {}
  304. func (uc *UASTChanges) Initialize(repository *git.Repository) {
  305. uc.cache = map[plumbing.Hash]*uast.Node{}
  306. }
  307. func (uc *UASTChanges) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  308. uasts := deps["uasts"].(map[plumbing.Hash]*uast.Node)
  309. treeDiffs := deps["changes"].(object.Changes)
  310. commit := make([]UASTChange, 0, len(treeDiffs))
  311. for _, change := range treeDiffs {
  312. action, err := change.Action()
  313. if err != nil {
  314. return nil, err
  315. }
  316. switch action {
  317. case merkletrie.Insert:
  318. hashTo := change.To.TreeEntry.Hash
  319. uastTo := uasts[hashTo]
  320. commit = append(commit, UASTChange{Before: nil, After: uastTo, Change: change})
  321. uc.cache[hashTo] = uastTo
  322. case merkletrie.Delete:
  323. hashFrom := change.From.TreeEntry.Hash
  324. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: nil, Change: change})
  325. delete(uc.cache, hashFrom)
  326. case merkletrie.Modify:
  327. hashFrom := change.From.TreeEntry.Hash
  328. hashTo := change.To.TreeEntry.Hash
  329. uastTo := uasts[hashTo]
  330. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: uastTo, Change: change})
  331. delete(uc.cache, hashFrom)
  332. uc.cache[hashTo] = uastTo
  333. }
  334. }
  335. return map[string]interface{}{"changed_uasts": commit}, nil
  336. }
  337. type UASTChangesSaver struct {
  338. // OutputPath points to the target directory with UASTs
  339. OutputPath string
  340. repository *git.Repository
  341. result [][]UASTChange
  342. }
  343. const (
  344. ConfigUASTChangesSaverOutputPath = "UASTChangesSaver.OutputPath"
  345. )
  346. func (saver *UASTChangesSaver) Name() string {
  347. return "UASTChangesSaver"
  348. }
  349. func (saver *UASTChangesSaver) Provides() []string {
  350. return []string{}
  351. }
  352. func (saver *UASTChangesSaver) Requires() []string {
  353. arr := [...]string{"changed_uasts"}
  354. return arr[:]
  355. }
  356. func (saver *UASTChangesSaver) Features() []string {
  357. arr := [...]string{"uast"}
  358. return arr[:]
  359. }
  360. func (saver *UASTChangesSaver) ListConfigurationOptions() []ConfigurationOption {
  361. options := [...]ConfigurationOption{{
  362. Name: ConfigUASTChangesSaverOutputPath,
  363. Description: "The target directory where to store the changed UAST files.",
  364. Flag: "changed-uast-dir",
  365. Type: StringConfigurationOption,
  366. Default: "."},
  367. }
  368. return options[:]
  369. }
  370. func (saver *UASTChangesSaver) Flag() string {
  371. return "dump-uast-changes"
  372. }
  373. func (saver *UASTChangesSaver) Configure(facts map[string]interface{}) {
  374. if val, exists := facts[ConfigUASTChangesSaverOutputPath]; exists {
  375. saver.OutputPath = val.(string)
  376. }
  377. }
  378. func (saver *UASTChangesSaver) Initialize(repository *git.Repository) {
  379. saver.repository = repository
  380. saver.result = [][]UASTChange{}
  381. }
  382. func (saver *UASTChangesSaver) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  383. changes := deps["changed_uasts"].([]UASTChange)
  384. saver.result = append(saver.result, changes)
  385. return nil, nil
  386. }
  387. func (saver *UASTChangesSaver) Finalize() interface{} {
  388. return saver.result
  389. }
  390. func (saver *UASTChangesSaver) Serialize(result interface{}, binary bool, writer io.Writer) error {
  391. saverResult := result.([][]UASTChange)
  392. fileNames := saver.dumpFiles(saverResult)
  393. if binary {
  394. return saver.serializeBinary(fileNames, writer)
  395. }
  396. saver.serializeText(fileNames, writer)
  397. return nil
  398. }
  399. func (saver *UASTChangesSaver) dumpFiles(result [][]UASTChange) []*pb.UASTChange {
  400. fileNames := []*pb.UASTChange{}
  401. for i, changes := range result {
  402. for j, change := range changes {
  403. if change.Before == nil || change.After == nil {
  404. continue
  405. }
  406. record := &pb.UASTChange{FileName: change.Change.To.Name}
  407. bs, _ := change.Before.Marshal()
  408. record.UastBefore = fmt.Sprintf(
  409. "%d_%d_before_%s.pb", i, j, change.Change.From.TreeEntry.Hash.String())
  410. goioutil.WriteFile(record.UastBefore, bs, 0666)
  411. blob, _ := saver.repository.BlobObject(change.Change.From.TreeEntry.Hash)
  412. s, _ := (&object.File{Blob: *blob}).Contents()
  413. record.SrcBefore = fmt.Sprintf(
  414. "%d_%d_before_%s.src", i, j, change.Change.From.TreeEntry.Hash.String())
  415. goioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
  416. bs, _ = change.After.Marshal()
  417. record.UastAfter = fmt.Sprintf(
  418. "%d_%d_after_%s.pb", i, j, change.Change.To.TreeEntry.Hash.String())
  419. goioutil.WriteFile(record.UastAfter, bs, 0666)
  420. blob, _ = saver.repository.BlobObject(change.Change.To.TreeEntry.Hash)
  421. s, _ = (&object.File{Blob: *blob}).Contents()
  422. record.SrcAfter = fmt.Sprintf(
  423. "%d_%d_after_%s.src", i, j, change.Change.To.TreeEntry.Hash.String())
  424. goioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
  425. fileNames = append(fileNames, record)
  426. }
  427. }
  428. return fileNames
  429. }
  430. func (saver *UASTChangesSaver) serializeText(result []*pb.UASTChange, writer io.Writer) {
  431. for _, sc := range result {
  432. kv := [...]string{
  433. "file: " + sc.FileName,
  434. "src0: " + sc.SrcBefore, "src1: " + sc.SrcAfter,
  435. "uast0: " + sc.UastBefore, "uast1: " + sc.UastAfter,
  436. }
  437. fmt.Fprintf(writer, " - {%s}\n", strings.Join(kv[:], ", "))
  438. }
  439. }
  440. func (saver *UASTChangesSaver) serializeBinary(result []*pb.UASTChange, writer io.Writer) error {
  441. message := pb.UASTChangesSaverResults{Changes: result}
  442. serialized, err := proto.Marshal(&message)
  443. if err != nil {
  444. return err
  445. }
  446. writer.Write(serialized)
  447. return nil
  448. }
  449. func init() {
  450. Registry.Register(&UASTExtractor{})
  451. Registry.Register(&UASTChanges{})
  452. Registry.Register(&UASTChangesSaver{})
  453. }